diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 698abb2ec4989..1768d4a0de4c0 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -12,7 +12,10 @@ ) import warnings -from pandas.util._decorators import doc +from pandas.util._decorators import ( + Appender, + doc, +) from pandas.util._exceptions import find_stack_level @@ -254,53 +257,18 @@ def _register_accessor(name: str, cls): Notes ----- - When accessed, your accessor will be initialized with the pandas object - the user is interacting with. So the signature must be - - .. code-block:: python - - def __init__(self, pandas_object): # noqa: E999 - ... - - For consistency with pandas methods, you should raise an ``AttributeError`` - if the data passed to your accessor has an incorrect dtype. - - >>> pd.Series(['a', 'b']).dt - Traceback (most recent call last): - ... - AttributeError: Can only use .dt accessor with datetimelike values - - Examples - -------- - In your library code:: + This function is used to register user defined Accessor classes for {klass}. + An accessor class needs to: + + * Have an init method + * that accepts only a single {klass} object as an argument + * Raise an AttributeError if the {klass} object does not have correct + input for this accessor (See examples) + * Have a method for every access pattern, + * methods can take any argument signature + * if an access pattern doesn't need any additional arguments, + it can be accessed as an attribute using the @property decorator. 
- import pandas as pd - - @pd.api.extensions.register_dataframe_accessor("geo") - class GeoAccessor: - def __init__(self, pandas_obj): - self._obj = pandas_obj - - @property - def center(self): - # return the geographic center point of this DataFrame - lat = self._obj.latitude - lon = self._obj.longitude - return (float(lon.mean()), float(lat.mean())) - - def plot(self): - # plot this array's data on a map, e.g., using Cartopy - pass - - Back in an interactive IPython session: - - .. code-block:: ipython - - In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10), - ...: "latitude": np.linspace(0, 20)}}) - In [2]: ds.geo.center - Out[2]: (5.0, 10.0) - In [3]: ds.geo.plot() # plots data on a map """ def decorator(accessor): @@ -319,6 +287,79 @@ def decorator(accessor): return decorator +df_accessor_example = """ +For instance, if you want your accessor to accept only integer data, +the class might look like this: + +.. code-block:: python + + @pd.api.extensions.register_dataframe_accessor("my_accessor") + class MyAccessor: + def __init__(self, pandas_obj): + if not all(pandas_obj[col].dtype == 'int64' for col in pandas_obj.columns): + raise AttributeError("All columns should contain only integer values.") + self._obj = pandas_obj + + def sum_squared(self): + return (self._obj ** 2).sum() + + @property + def total_elements(self): + return self._obj.size + + +>>> df = pd.DataFrame([[1, 2], ['a', 'b']]) # incorrect dtype +>>> df.my_accessor +Traceback (most recent call last): +... +AttributeError: All columns should contain only integer values. 
+ +>>> df = pd.DataFrame([[1, 2], [3, 4]]) +>>> df.my_accessor.sum_squared() +0 10 +1 20 +dtype: int64 +>>> df.my_accessor.total_elements +4 + + +Examples +-------- +In your library code:: + + import pandas as pd + + @pd.api.extensions.register_dataframe_accessor("geo") + class GeoAccessor: + def __init__(self, pandas_obj): + if not all(col in pandas_obj for col in ("latitude", "longitude")): + raise AttributeError("Must have 'latitude' and 'longitude'.") + self._obj = pandas_obj + + @property + def center(self): + # return the geographic center point of this DataFrame + lat = self._obj.latitude + lon = self._obj.longitude + return (float(lon.mean()), float(lat.mean())) + + def plot(self): + # plot this array's data on a map, e.g., using Cartopy + pass + +Back in an interactive IPython session: + +.. code-block:: ipython + + In [1]: ds = pd.DataFrame({"longitude": np.linspace(0, 10), + ...: "latitude": np.linspace(0, 20)}) + In [2]: ds.geo.center + Out[2]: (5.0, 10.0) + In [3]: ds.geo.plot() # plots data on a map +""" + + +@Appender(df_accessor_example) @doc(_register_accessor, klass="DataFrame") def register_dataframe_accessor(name: str): from pandas import DataFrame @@ -326,6 +367,81 @@ def register_dataframe_accessor(name: str): return _register_accessor(name, DataFrame) +_series_docu = """ + + +.. code-block:: python + + @pd.api.extensions.register_series_accessor("my_accessor") + class MyAccessor: + def __init__(self, pandas_obj): + if not infer_dtype(pandas_obj) == 'integer': + raise AttributeError("The series must contain only integer data.") + self._obj = pandas_obj + + def sum_squared(self): + return (self._obj ** 2).sum() + + @property + def total_elements(self): + return self._obj.size + +>>> df = pd.Series([1, 'a', 2, 'b']) +>>> df.my_accessor +Traceback (most recent call last): +... +AttributeError: The series must contain only integer data. 
+ +>>> df = pd.Series([1, 2, 3]) +>>> df.my_accessor.sum_squared() +14 +>>> df.my_accessor.total_elements +3 + +Examples +-------- +In your library code:: + + import pandas as pd + from pandas.api.types import infer_dtype + from nltk.stem import WordNetLemmatizer + from nltk.tokenize import word_tokenize + + + @pd.api.extensions.register_series_accessor("nlp") + class NLPExtension: + def __init__(self, series): + if not infer_dtype(series) == 'string': + raise AttributeError("The series must contain only string data.") + self._obj = series + + @property + def lemma(self): + lemmatizer = WordNetLemmatizer() + + def lemmatize_sentence(sentence): + words = word_tokenize(sentence) + lemmatized_words = [lemmatizer.lemmatize(word) for word in words] + return ' '.join(lemmatized_words) + + return self._obj.apply(lemmatize_sentence) + +Back in an interactive IPython session: + +.. code-block:: ipython + + In [1]: r = ['The cats are running', 'She flies kites'] + In [2]: ser = pd.Series(r, copy=False) + In [3]: ser.nlp.lemma + Out[3]: + 0 The cat are running + 1 She fly kite + dtype: object + +""" + + +@Appender(_series_docu) @doc(_register_accessor, klass="Series") def register_series_accessor(name: str): from pandas import Series @@ -333,6 +449,106 @@ def register_series_accessor(name: str): return _register_accessor(name, Series) +_index_docu = """ + +.. code-block:: python + + @pd.api.extensions.register_index_accessor("my_accessor") + class MyAccessor: + def __init__(self, raw_index): + if not all(isinstance(x, int) for x in raw_index): + raise AttributeError("The index can only be integer.") + self._obj = raw_index + + def even(self): + return [x for x in self._obj if x % 2 == 0] + + @property + def number_of_odd(self): + return sum(x % 2 == 1 for x in self._obj) + +>>> df = pd.DataFrame.from_dict({ + 'row1': {'1':1, '2':'a'}, + 'row2': {'1':2, '2':'b'} + }, orient='index') +>>> df.index.my_accessor +Traceback (most recent call last): +... +AttributeError: The index can only be integer. 
+ + +>>> df = pd.DataFrame({ + 'col1': [1, 2, 3, 4], + 'col2': ['a', 'b', 'c', 'd'] + }, index=[1, 2, 5, 8]) +>>> df.index.my_accessor.even() +[2, 8] +>>> df.index.my_accessor.number_of_odd +2 + +Examples +-------- +In your library code:: + + from pandas.tseries.holiday import USFederalHolidayCalendar + + @pd.api.extensions.register_index_accessor("timeoff") + class TimeOffAccessor: + def __init__(self, raw_index): + self._raw = raw_index + try: + self._date = pd.to_datetime(self._raw) + except (ValueError, TypeError) as err: + raise AttributeError("Must be able to convert " + f"index {self._raw} to datetime") from err + + min_data = self._date.min() + max_data = self._date.max() + self._holidays = ( + USFederalHolidayCalendar() + .holidays(start=min_data, end=max_data) + ) + + @property + def weekend(self): + is_weekend = self._date.weekday.isin([5, 6]) + return self._raw[is_weekend] + @property + def holiday(self): + is_holiday = self._date.isin(self._holidays) + return self._raw[is_holiday] + +Back in an interactive IPython session: + +.. code-block:: ipython + + In [1]: exercise_data = pd.DataFrame.from_dict({ + '1/1/2018':{'exercise':'run', 'minutes':30}, + '1/4/2018':{'exercise':'swim', 'minutes':45}, + '1/6/2018':{'exercise':'swim', 'minutes':45}, + '1/7/2018':{'exercise':'run', 'minutes':30} + }, orient='index') + + In [2]: exercise_data + Out[2]: exercise minutes + 1/1/2018 run 30 + 1/4/2018 swim 45 + 1/6/2018 swim 45 + 1/7/2018 run 30 + + In [3]: exercise_data.loc[exercise_data.index.timeoff.weekend] + Out[3]: exercise minutes + 1/6/2018 swim 45 + 1/7/2018 run 30 + + In [4]: exercise_data.loc[exercise_data.index.timeoff.holiday] + Out[4]: exercise minutes + 1/1/2018 run 30 + +""" + + +@Appender(_index_docu) @doc(_register_accessor, klass="Index") def register_index_accessor(name: str): from pandas import Index