diff --git a/conf.py b/conf.py
index 3a1548d..bdeceb1 100644
--- a/conf.py
+++ b/conf.py
@@ -42,6 +42,7 @@
     'sphinx.ext.intersphinx',
     'sphinx.ext.mathjax',
     'sphinx.ext.viewcode',
+    'sphinx_issues',
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -165,3 +166,7 @@
 # -- Options for intersphinx extension ---------------------------------------
 
 intersphinx_mapping = {'sklearn': ('http://scikit-learn.org/stable', None)}
+
+# -- Sphinx-Issues configuration --
+
+issues_github_path = "scikit-learn/scikit-learn"
diff --git a/index.rst b/index.rst
index a68713e..e5f5718 100644
--- a/index.rst
+++ b/index.rst
@@ -29,6 +29,7 @@
    slep002/proposal
    slep003/proposal
    slep004/proposal
+   slep006/proposal
 
 .. toctree::
    :maxdepth: 1
diff --git a/requirements.txt b/requirements.txt
index cbf1e36..5666abb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 sphinx
 sphinx-rtd-theme
+sphinx-issues
diff --git a/slep006/cases_opt0a.py b/slep006/cases_opt0a.py
new file mode 100644
index 0000000..d0141fa
--- /dev/null
+++ b/slep006/cases_opt0a.py
@@ -0,0 +1,6 @@
+from defs import (accuracy, group_cv, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate,
+                  make_pipeline, X, y, my_groups, my_weights,
+                  my_other_weights)
+
+# TODO
diff --git a/slep006/cases_opt0b.py b/slep006/cases_opt0b.py
new file mode 100644
index 0000000..f543e9b
--- /dev/null
+++ b/slep006/cases_opt0b.py
@@ -0,0 +1,7 @@
+import pandas as pd
+from defs import (accuracy, group_cv, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate,
+                  make_pipeline, X, y, my_groups, my_weights,
+                  my_other_weights)
+
+# TODO
diff --git a/slep006/cases_opt1.py b/slep006/cases_opt1.py
new file mode 100644
index 0000000..a8185d3
--- /dev/null
+++ b/slep006/cases_opt1.py
@@ -0,0 +1,78 @@
+from defs import (accuracy, group_cv, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate, make_pipeline, X, y,
+                  my_groups, my_weights, my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy')
+
+# Error handling: if props={'sample_eight': my_weights, ...} was passed
+# instead, the estimator would fit and score without weight, silently failing.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+
+class MyLogisticRegressionCV(LogisticRegressionCV):
+    def fit(self, X, y, props=None):
+        props = {} if props is None else props.copy()
+        props.pop('sample_weight', None)
+        return super().fit(X, y, props=props)
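+
+
+# A usage sketch: under this "pass everything" option, the subclass above
+# would be invoked just like Case A (the weights then reach scoring only).
+lr = MyLogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy')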
+
+# %%
+# Case C: unweighted feature selection
+
+# Currently feature selection does not handle sample_weight, and as long as
+# that remains the case, it will simply ignore the prop passed to it. Hence:
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+sel = SelectKBest()
+pipe = make_pipeline(sel, lr)
+cross_validate(pipe, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy')
+
+# %%
+# Case D: different scoring and fitting weights
+
+weighted_acc = make_scorer(accuracy)
+
+
+def specially_weighted_acc(est, X, y, props):
+    props = props.copy()
+    props['sample_weight'] = props.pop('scoring_weight')
+    return weighted_acc(est, X, y, props)
+
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=specially_weighted_acc,
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={
+                   'scoring_weight': my_weights,
+                   'sample_weight': my_other_weights,
+                   'groups': my_groups,
+               },
+               scoring=specially_weighted_acc)
diff --git a/slep006/cases_opt2.py b/slep006/cases_opt2.py
new file mode 100644
index 0000000..4148e66
--- /dev/null
+++ b/slep006/cases_opt2.py
@@ -0,0 +1,76 @@
+from defs import (group_cv, SelectKBest, LogisticRegressionCV,
+                  cross_validate, make_pipeline, X, y, my_groups,
+                  my_weights, my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+props = {'cv__groups': my_groups,
+         'estimator__cv__groups': my_groups,
+         'estimator__sample_weight': my_weights,
+         'scoring__sample_weight': my_weights,
+         'estimator__scoring__sample_weight': my_weights}
+cross_validate(lr, X, y, cv=group_cv,
+               props=props,
+               scoring='accuracy')
+
+# Error handling: if props={'estimator__sample_eight': my_weights, ...} was
+# passed instead, the estimator would raise an error.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+props = {'cv__groups': my_groups,
+         'estimator__cv__groups': my_groups,
+         'scoring__sample_weight': my_weights,
+         'estimator__scoring__sample_weight': my_weights}
+cross_validate(lr, X, y, cv=group_cv,
+               props=props,
+               scoring='accuracy')
+
+# %%
+# Case C: unweighted feature selection
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+pipe = make_pipeline(SelectKBest(), lr)
+props = {'cv__groups': my_groups,
+         'estimator__logisticregressioncv__cv__groups': my_groups,
+         'estimator__logisticregressioncv__sample_weight': my_weights,
+         'scoring__sample_weight': my_weights,
+         'estimator__scoring__sample_weight': my_weights}
+cross_validate(pipe, X, y, cv=group_cv,
+               props=props,
+               scoring='accuracy')
+
+# %%
+# Case D: different scoring and fitting weights
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+props = {'cv__groups': my_groups,
+         'estimator__cv__groups': my_groups,
+         'estimator__sample_weight': my_other_weights,
+         'scoring__sample_weight': my_weights,
+         'estimator__scoring__sample_weight': my_weights}
+cross_validate(lr, X, y, cv=group_cv,
+               props=props,
+               scoring='accuracy')
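+
+# Note the repetition: each prop must be respecified at every routing level.
+# If the estimator were nested one level deeper (a hypothetical structure),
+# every key would need rewriting, e.g.:
+#     props = {'estimator__pipeline__logisticregressioncv__sample_weight':
+#              my_weights, ...}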
diff --git a/slep006/cases_opt3.py b/slep006/cases_opt3.py
new file mode 100644
index 0000000..5b4b450
--- /dev/null
+++ b/slep006/cases_opt3.py
@@ -0,0 +1,104 @@
+from defs import (accuracy, make_scorer, SelectKBest, LogisticRegressionCV,
+                  group_cv, cross_validate, make_pipeline, X, y, my_groups,
+                  my_weights, my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  'scoring': ['sample_weight'],
+                  }
+    # one question here is whether we need to explicitly route sample_weight
+    # to LogisticRegressionCV's fitting...
+)
+
+# Alternative syntax, which assumes cv receives 'groups' by default, and that
+# a method-based API is provided on meta-estimators:
+# lr = LogisticRegressionCV(
+#     cv=group_cv,
+#     scoring='accuracy',
+# ).add_prop_route(scoring='sample_weight')
+
+cross_validate(lr, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy',
+               prop_routing={'estimator': '*',  # pass all props
+                             'cv': ['groups'],
+                             'scoring': ['sample_weight'],
+                             })
+
+# Error handling: if props={'sample_eight': my_weights, ...} was passed
+# instead, LogisticRegressionCV would have to identify that a key was passed
+# that could not be routed nor used, in order to raise an error.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+# Here we rename the sample_weight prop so that we can specify that it only
+# applies to scoring.
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  # read the following as "scoring should consume
+                  # 'scoring_weight' as if it were 'sample_weight'."
+                  'scoring': {'sample_weight': 'scoring_weight'},
+                  },
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={'scoring_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy',
+               prop_routing={'estimator': '*',
+                             'cv': ['groups'],
+                             'scoring': {'sample_weight': 'scoring_weight'},
+                             })
+
+# %%
+# Case C: unweighted feature selection
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  'scoring': ['sample_weight'],
+                  })
+pipe = make_pipeline(SelectKBest(), lr,
+                     prop_routing={'logisticregressioncv': ['sample_weight',
+                                                            'groups']})
+cross_validate(pipe, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy',
+               prop_routing={'estimator': '*',
+                             'cv': ['groups'],
+                             'scoring': ['sample_weight'],
+                             })
+
+# %%
+# Case D: different scoring and fitting weights
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  # read the following as "scoring should consume
+                  # 'scoring_weight' as if it were 'sample_weight'."
+                  'scoring': {'sample_weight': 'scoring_weight'},
+                  },
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={'scoring_weight': my_weights, 'groups': my_groups,
+                      'fitting_weight': my_other_weights},
+               scoring='accuracy',
+               prop_routing={'estimator': {'sample_weight': 'fitting_weight',
+                                           'scoring_weight': 'scoring_weight',
+                                           'groups': 'groups'},
+                             'cv': ['groups'],
+                             'scoring': {'sample_weight': 'scoring_weight'},
+                             })
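+
+# Note that the 'scoring_weight' routing above must be specified
+# consistently at both the cross_validate and LogisticRegressionCV levels;
+# since the prop is optional in the scorer, an omission at either level may
+# silently yield unweighted scoring rather than an error.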
diff --git a/slep006/cases_opt4.py b/slep006/cases_opt4.py
new file mode 100644
index 0000000..1d1325c
--- /dev/null
+++ b/slep006/cases_opt4.py
@@ -0,0 +1,84 @@
+from defs import (accuracy, group_cv, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate,
+                  make_pipeline, X, y, my_groups, my_weights,
+                  my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+# Here we presume that GroupKFold requests `groups` by default.
+# We need to explicitly request weights in make_scorer and for
+# LogisticRegressionCV. Both of these consumers understand the meaning
+# of the key "sample_weight".
+
+weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+).set_props_request(['sample_weight'])
+cross_validate(lr, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring=weighted_acc)
+
+# Error handling: if props={'sample_eight': my_weights, ...} was passed,
+# cross_validate would raise an error, since 'sample_eight' was not requested
+# by any of its children.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+# Since LogisticRegressionCV requires that weights be explicitly requested,
+# removing that request means the fitting is unweighted.
+
+weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring=weighted_acc)
+
+# %%
+# Case C: unweighted feature selection
+
+# Like LogisticRegressionCV, SelectKBest needs to request weights explicitly.
+# Here it does not request them.
+
+weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+).set_props_request(['sample_weight'])
+sel = SelectKBest()
+pipe = make_pipeline(sel, lr)
+cross_validate(pipe, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring=weighted_acc)
+
+# %%
+# Case D: different scoring and fitting weights
+
+# Despite make_scorer and LogisticRegressionCV both expecting a key
+# sample_weight, we can use aliases to pass different weights to different
+# consumers.
+
+weighted_acc = make_scorer(accuracy,
+                           request_props={'scoring_weight': 'sample_weight'})
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+).set_props_request({'fitting_weight': 'sample_weight'})
+cross_validate(lr, X, y, cv=group_cv,
+               props={
+                   'scoring_weight': my_weights,
+                   'fitting_weight': my_other_weights,
+                   'groups': my_groups,
+               },
+               scoring=weighted_acc)
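+
+# Under this proposal the requests could also be introspected; a sketch of
+# what Case D's consumers might report, assuming get_props_request returns
+# the user-key to consumer-key mapping described in the SLEP text:
+#     lr.get_props_request()            # {'fitting_weight': 'sample_weight'}
+#     weighted_acc.get_props_request()  # {'scoring_weight': 'sample_weight'}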
diff --git a/slep006/defs.py b/slep006/defs.py
new file mode 100644
index 0000000..2026c8e
--- /dev/null
+++ b/slep006/defs.py
@@ -0,0 +1,16 @@
+import numpy as np
+from sklearn.feature_selection import SelectKBest
+from sklearn.linear_model import LogisticRegressionCV
+from sklearn.metrics import accuracy_score as accuracy
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import GroupKFold, cross_validate
+from sklearn.pipeline import make_pipeline
+
+group_cv = GroupKFold()
+
+N, M = 100, 4
+X = np.random.rand(N, M)
+y = np.random.randint(0, 2, size=N)
+my_groups = np.random.randint(0, 10, size=N)
+my_weights = np.random.rand(N)
+my_other_weights = np.random.rand(N)
diff --git a/slep006/proposal.rst b/slep006/proposal.rst
new file mode 100644
index 0000000..c336e4a
--- /dev/null
+++ b/slep006/proposal.rst
@@ -0,0 +1,534 @@
+.. _slep_006:
+
+================================
+Routing sample-aligned meta-data
+================================
+
+:Author: Joel Nothman
+:Status: Draft
+:Type: Standards Track
+:Created: 2019-03-07
+
+
+Scikit-learn has limited support for passing information pertaining to each
+sample (henceforth "sample properties") through an estimation pipeline.
+The user can, for instance, pass fit parameters to all members of a
+FeatureUnion, or to a specified member of a Pipeline using dunder (``__``)
+prefixing::
+
+    >>> from sklearn.pipeline import Pipeline
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> pipe = Pipeline([('clf', LogisticRegression())])
+    >>> pipe.fit([[1, 2], [3, 4]], [5, 6],
+    ...          clf__sample_weight=[.5, .7])  # doctest: +SKIP
+
+Several other meta-estimators, such as GridSearchCV, support forwarding these
+fit parameters to their base estimator when fitting.
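+
+For example, GridSearchCV forwards unrecognised fit keyword arguments to
+its base estimator, so weights can be passed through a grid search (an
+illustrative sketch)::
+
+    >>> from sklearn.model_selection import GridSearchCV
+    >>> gs = GridSearchCV(LogisticRegression(), {'C': [1, 10]})
+    >>> gs.fit([[1, 2], [3, 4]], [5, 6],
+    ...        sample_weight=[.5, .7])  # doctest: +SKIP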
+
+Desirable features we do not currently support include:
+
+* passing sample properties (e.g. `sample_weight`) to a scorer used in
+  cross-validation
+* passing sample properties (e.g. `groups`) to a CV splitter in nested
+  cross validation
+* (maybe in scope) passing sample properties (e.g. `sample_weight`) to
+  some scorers and not others in a multi-metric cross-validation setup
+* (likely out of scope) passing sample properties to non-fit methods, for
+  instance to index grouped samples that are to be treated as a single
+  sequence in prediction.
+
+Definitions
+-----------
+
+consumer
+    An estimator, scorer, splitter, etc., that receives and can make use
+    of one or more passed props.
+key
+    A label passed along with sample prop data to indicate how it should
+    be interpreted (e.g. "weight").
+router
+    An estimator or function that passes props on to some other router or
+    consumer, potentially selecting which props to pass to which
+    destination, and by what key.
+
+History
+-------
+
+This version was drafted after a discussion of the issue and potential
+solutions at the February 2019 development sprint in Paris.
+
+Supersedes `SLEP004
+<https://github.com/scikit-learn/enhancement_proposals/tree/master/slep004>`_
+with greater depth of desiderata and options.
+
+Primary related issues and pull requests include:
+
+- :issue:`4497`: Overarching issue,
+  "Consistent API for attaching properties to samples"
+  by :user:`GaelVaroquaux`
+- :pr:`4696` A first implementation by :user:`amueller`
+- `Discussion towards SLEP004 <>`__ initiated by :user:`tguillemot`
+- :pr:`9566` Another implementation (solution 3 from this SLEP)
+  by :user:`jnothman`
+- :pr:`16079` Another implementation (solution 4 from this SLEP)
+  by :user:`adrinjalali`
+
+Other related issues include: :issue:`1574`, :issue:`2630`, :issue:`3524`,
+:issue:`4632`, :issue:`4652`, :issue:`4660`, :issue:`4696`, :issue:`6322`,
+:issue:`7112`, :issue:`7646`, :issue:`7723`, :issue:`8127`, :issue:`8158`,
+:issue:`8710`, :issue:`8950`, :issue:`11429`, :issue:`12052`,
+:issue:`15282`, :issue:`15370`, :issue:`15425`.
+
+Desiderata
+----------
+
+We will consider the following aspects to develop and compare solutions:
+
+Usability
+    Can the use cases be achieved in succinct, readable code? Can common
+    use cases be achieved with a simple recipe copy-pasted from a QA
+    forum?
+Brittleness
+    If a property is being routed through a Pipeline, does changing the
+    structure of the pipeline (e.g. adding a layer of nesting) require
+    rewriting other code?
+Error handling
+    If the user mistypes the name of a sample property, or misspecifies
+    how it should be routed to a consumer, will an appropriate exception
+    be raised?
+Impact on meta-estimator design
+    How much meta-estimator code needs to change? How hard will it be to
+    maintain?
+Impact on estimator design
+    How much will the proposal affect estimator developers?
+Backwards compatibility
+    Can existing behavior be maintained?
+Forwards compatibility
+    Is the solution going to make users' code more brittle with future
+    changes? (For example, will a user's pipeline change behaviour
+    radically when sample_weight is implemented on some estimator?)
+Introspection
+    If sensible to do so (e.g. for improved efficiency), can a
+    meta-estimator identify whether its base estimator (recursively)
+    would handle some particular sample property (e.g. so a
+    meta-estimator can choose between weighting and resampling, or for
+    automated invariance testing)?
+
+Keyword arguments vs. a single argument
+---------------------------------------
+
+Currently, sample properties are provided as keyword arguments to a `fit`
+method. In redeveloping sample properties, we can instead accept a single
+parameter (named `props` or `sample_props`, for example) which maps
+string keys to arrays of the same length (a "DataFrame-like").
+
+Keyword arguments::
+
+    >>> gs.fit(X, y, groups=groups, sample_weight=sample_weight)
+
+Single argument::
+
+    >>> gs.fit(X, y, props={'groups': groups, 'weight': weight})
+
+While drafting this document, we will assume the latter notation for
+clarity.
+
+Advantages of multiple keyword arguments:
+
+* succinct
+* possible to maintain backwards compatible support for sample_weight,
+  etc.
+* we do not need to handle cases for whether or not some estimator
+  expects a `props` argument.
+
+Advantages of a single argument:
+
+* we are able to consider kwargs to `fit` that are not sample-aligned, so
+  that we can add further functionality (some that have been proposed:
+  `with_warm_start`, `feature_names_in`, `feature_meta`).
+* we are able to redefine the default routing of weights etc. without
+  being concerned by backwards compatibility.
+* we can consider the use of keys that are not limited to strings or
+  valid identifiers (and hence are not limited to using ``_`` as a
+  delimiter).
+
+Test case setup
+---------------
+
+Case A
+~~~~~~
+
+Cross-validate a ``LogisticRegressionCV(cv=GroupKFold(), scoring='accuracy')``
+with weighted scoring and weighted fitting.
+
+Error handling: what would happen if the user misspelled `sample_weight`
+as `sample_eight`?
+
+Case B
+~~~~~~
+
+Cross-validate a ``LogisticRegressionCV(cv=GroupKFold(), scoring='accuracy')``
+with weighted scoring and unweighted fitting.
+
+Case C
+~~~~~~
+
+Extend Case A to apply an unweighted univariate feature selector in a
+``Pipeline``.
+
+Case D
+~~~~~~
+
+Different weights for scoring and for fitting in Case A.
+
+TODO: case involving props passed at test time, e.g. to pipe.transform
+(???).
+TODO: case involving score() method, e.g. not specifying scoring in
+cross_val_score when wrapping an estimator with weighted score func ...
+
+Solution sketches will import these definitions:
+
+.. literalinclude:: defs.py
+
+Status quo solution 0a: additional feature
+------------------------------------------
+
+Without changing scikit-learn, the following hack can be used:
+
+Additional numeric features representing sample props can be appended to
+the data and passed around, being handled specially in each consumer of
+features or sample props.
+
+.. literalinclude:: cases_opt0a.py
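+
+Since the example file is still a TODO, an illustrative sketch of the
+hack for Case A's fitting step follows (``StripWeight`` is a hypothetical
+wrapper, not an existing API)::
+
+    import numpy as np
+    from defs import LogisticRegressionCV, X, my_weights
+
+    # append the weights as an extra feature column
+    X_aug = np.hstack([X, my_weights.reshape(-1, 1)])
+
+    class StripWeight(LogisticRegressionCV):
+        def fit(self, X, y):
+            # split the appended column back out before real fitting
+            return super().fit(X[:, :-1], y, sample_weight=X[:, -1])
+
+Because the weights travel inside ``X``, they are subsetted alongside
+the data by any cross-validation splitter.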
+
+Status quo solution 0b: Pandas Index and global resources
+---------------------------------------------------------
+
+Without changing scikit-learn, the following hack can be used:
+
+If `y` is represented with a Pandas datatype, then its index can be used
+to access required elements from props stored in a global namespace (or
+otherwise made available to the estimator before fitting). This is
+possible everywhere that a gold-standard `y` is passed, including fit,
+split and score. A similar solution with `X` is also possible for
+handling predict-time props, if all Pipeline components retain the
+original Pandas Index.
+
+Issues:
+
+* use of global data source
+* requires Pandas data types and indices to be maintained
+
+.. literalinclude:: cases_opt0b.py
+
+Solution 1: Pass everything
+---------------------------
+
+This proposal passes all props to all consumers (estimators, splitters,
+scorers, etc). The consumer would optionally use props it is familiar
+with by name and disregard other props.
+
+We may consider providing syntax for the user to control the
+interpretation of incoming props:
+
+* to require that some prop is provided (for an estimator where that
+  prop is otherwise optional)
+* to disregard some provided prop
+* to treat a particular prop key as having a certain meaning (e.g.
+  locally interpreting 'scoring_sample_weight' as 'sample_weight').
+
+These constraints would be checked by calling a helper at the consumer
+(a sketch of such a helper follows the example below).
+
+Issues:
+
+* Error handling: if a key is optional in a consumer, no error will be
+  raised for misspelling. An introspection API might change this,
+  allowing a user or meta-estimator to check if all keys passed are to
+  be used in at least one consumer.
+* Forwards compatibility: newly supporting a prop key in a consumer
+  will change behaviour. Other than a ChangedBehaviorWarning, I don't
+  see any way around this.
+* Introspection: not inherently supported. Would need an API like
+  ``get_prop_support(names: List[str]) -> Dict[str, Literal["supported",
+  "required", "ignored"]]``.
+
+In short, this is a simple solution, but prone to risk.
+
+.. literalinclude:: cases_opt1.py
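+
+A sketch of the consumer-side helper mentioned above (the name
+``check_props`` and its signature are illustrative, not a proposed API)::
+
+    def check_props(props, required=(), ignored=(), aliases=None):
+        # rename incoming keys per the third bullet above
+        aliases = aliases or {}
+        props = {aliases.get(k, k): v for k, v in props.items()}
+        missing = set(required) - set(props)
+        if missing:
+            raise ValueError("missing required props: %r" % sorted(missing))
+        # drop props this consumer chooses to disregard
+        return {k: v for k, v in props.items() if k not in set(ignored)}
+
+Each consumer would call such a helper at the start of ``fit`` (or
+``split``, ``__call__``, etc.) with its own constraints.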
+
+
+Solution 2: Specify routes at call
+----------------------------------
+
+Similar to the legacy behavior of fit parameters in
+:class:`sklearn.pipeline.Pipeline`, this requires the user to specify
+the path for each "prop" to follow when calling `fit`. For example, to
+pass a prop named 'weights' to a step named 'spam' in a Pipeline, you
+might use `my_pipe.fit(X, y, props={'spam__weights': my_weights})`.
+
+SLEP004's syntax to override the common routing scheme falls under this
+solution.
+
+Advantages:
+
+* Very explicit and robust to misspellings.
+
+Issues:
+
+* The user needs to know the deep internal structure, or it is easy to
+  fail to pass a prop to a specific estimator.
+* A corollary is that prop keys need changing when the developer
+  modifies their estimator structure (see case C).
+* This gets especially tricky or impossible where the available routes
+  change mid-fit, such as where a grid search considers estimators with
+  different structures.
+* We would need to find a different solution for :issue:`2630` where a
+  Pipeline could not be the base estimator of AdaBoost because AdaBoost
+  expects the base estimator to accept a fit param keyed
+  'sample_weight'.
+* This may not work if a meta-estimator were to have the role of
+  changing a prop, e.g. a meta-estimator that passes `sample_weight`
+  corresponding to balanced classes onto its base estimator. The
+  meta-estimator would need a list of destinations to pass modified
+  props to, or a list of keys to modify.
+* We would need to develop naming conventions for different routes,
+  which may be more complicated than the current conventions; while a
+  GridSearchCV wrapping a Pipeline currently takes parameters with keys
+  like `{step_name}__{prop_name}`, this explicit routing, which must
+  avoid conflict with GridSearchCV's own routing destinations, implies
+  keys like `estimator__{step_name}__{prop_name}`.
+
+.. literalinclude:: cases_opt2.py
+
+
+Solution 3: Specify routes on meta-estimators
+---------------------------------------------
+
+Each meta-estimator is given a routing specification which it must
+follow in passing only the required parameters to each of its children.
+In this context, a GridSearchCV has children including `estimator`,
+`cv` and (each element of) `scoring`.
+
+Pull request :pr:`9566` and its extension in :pr:`15425` are partial
+implementations of this approach.
+
+A major benefit of this approach is that it may require modifying only
+prop-routing meta-estimators, and not prop consumers.
+
+All consumers would be required to check that the props passed to them
+are ones they expect, so that misspellings and misrouted props raise an
+error.
+
+Issues:
+
+* Routing may be hard to get one's head around, especially since the
+  prop support belongs to the child estimator but the parent is
+  responsible for the routing.
+* Need to design an API for specifying routings.
+* As in Solution 2, each local destination for routing props needs to
+  be given a name.
+* Every router along the route will need consistent instructions to
+  pass a specific prop to a consumer. If the prop is optional in the
+  consumer, routing failures may be hard to identify and debug.
+* For estimators to be cloned, this routing information needs to be
+  cloned with it. This implies one of two approaches: the routing
+  information is stored as a constructor parameter; or `clone` is
+  extended to explicitly copy routing information.
+
+Possible public syntax:
+
+Each meta-estimator has a `prop_routing` parameter to encode local
+routing rules, and a set of named children which it routes to. In
+:pr:`9566`, the `prop_routing` entry for each child may be a white list
+or black list of named keys passed to the meta-estimator.
+
+.. literalinclude:: cases_opt3.py
+
+
+Solution 4: Each child requests
+-------------------------------
+
+Here the meta-estimator provides only what each of its children
+requests. The meta-estimator would also need to request, on behalf of
+its children, any prop that descendant consumers require.
+
+Each object in a situation that could receive props would have a method
+like `_get_prop_requests()` which would return a list of prop names (or
+perhaps a mapping for more sophisticated use-cases). Group* CV
+splitters would default to returning `['groups']`, for example.
+Estimators supporting weighted fitting may return `[]` by default, but
+may have a parameter `request_props` which may be set to `['weight']`
+if weight is sought, or perhaps just a boolean parameter
+`request_weight`. `make_scorer` would have a similar mechanism for
+enabling weighted scoring.
+
+Advantages:
+
+* This will not need to affect legacy estimators, since no props will
+  be passed when a props request is not available.
+* This does not require defining a new syntax for routing.
+* The implementation changes in meta-estimators may be easy to provide
+  via a helper or two (perhaps even `call_with_props(method, target,
+  props)`).
+* Easy to reconfigure what props an estimator gets in a grid search.
+* Could make use of existing `**fit_params` syntax rather than
+  introducing a new `props` argument to `fit`.
+
+Disadvantages:
+
+* This will require modifying every estimator that may want props, as
+  well as all meta-estimators. We could provide a mixin or similar to
+  add prop-request support to a legacy estimator; or `BaseEstimator`
+  could have a `set_props_request` method (instead of the
+  `request_props` constructor parameter approach) such that all legacy
+  base estimators are automatically equipped.
+* For estimators to be cloned, this request information needs to be
+  cloned with it. This implies one of two approaches: the request
+  information is stored as a constructor parameter; or `clone` is
+  extended to explicitly copy request information.
+
+Possible public syntax:
+
+* `BaseEstimator` will have methods `set_props_request` and
+  `get_props_request`.
+* `make_scorer` will have a `request_props` parameter to set props
+  required by the scorer.
+* `get_props_request` will return a dict. It maps the key that the user
+  passes to the key that the estimator expects.
+* `set_props_request` will accept either such a dict or a sequence `s`
+  to be interpreted as the identity mapping for all elements in `s`
+  (`{x: x for x in s}`), as sketched below. It will return `self` to
+  enable chaining.
+* `Group*` CV splitters will by default request the 'groups' prop, but
+  their mapping can be changed with their `set_props_request` method.
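+
+A minimal sketch of the request normalization implied by the bullets
+above (assumed semantics, not a committed implementation)::
+
+    def normalize_props_request(request):
+        if not isinstance(request, dict):
+            # a sequence is shorthand for the identity mapping
+            request = {x: x for x in request}
+        return request
+
+    >>> normalize_props_request(['sample_weight'])
+    {'sample_weight': 'sample_weight'}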
+
+Test cases:
+
+.. literalinclude:: cases_opt4.py
+
+Naming
+------
+
+"Sample props" has become a name understood internally to the
+Scikit-learn development team. For ongoing usage we have several
+choices for naming:
+
+* Sample meta
+* Sample properties
+* Sample props
+* Sample extra
+
+Proposal
+--------
+
+Having considered the above solutions, we propose:
+
+TODO
+
+* which solution?
+* if an estimator requests a prop, must it be not-null? Must it be
+  provided or explicitly passed as None?
+* props param or kwargs?
+* naming?
+
+Backward compatibility
+----------------------
+
+TODO
+
+TODO: Do we continue to handle sample_weight such that it only gets
+provided if requested explicitly? Or do we make it requested by default
+in the future (possibly with a deprecation period)?
+
+During a deprecation period, fit_params will be handled dually: keys
+that are requested will be passed through the new request mechanism,
+while keys that are not known will be routed using legacy mechanisms.
+At completion of the deprecation period, the legacy handling will
+cease.
+
+Grouped cross validation splitters will request `groups` since they
+were previously unusable in a nested cross validation context, so this
+should not often create backwards incompatibilities, except perhaps
+where a fit param named `groups` served another purpose.
+
+Discussion
+----------
+
+One benefit of the explicitness in Solution 4 is that even if it makes
+use of ``**kwargs``, it does not preclude keyword arguments serving
+other purposes in addition. That is, in addition to requesting sample
+props, a future proposal could allow estimators to request feature
+metadata or other keys.
+
+TODO
+
+References and Footnotes
+------------------------
+
+.. [1] Each SLEP must either be explicitly labeled as placed in the
+   public domain (see this SLEP as an example) or licensed under the
+   `Open Publication License`_.
+
+.. _Open Publication License: https://www.opencontent.org/openpub/
+
+
+Copyright
+---------
+
+This document has been placed in the public domain. [1]_