Skip to content

Improved notes and added per class examples for accessor extensions #55442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
310 changes: 263 additions & 47 deletions pandas/core/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
)
import warnings

from pandas.util._decorators import doc
from pandas.util._decorators import (
Appender,
doc,
)
from pandas.util._exceptions import find_stack_level


Expand Down Expand Up @@ -254,53 +257,18 @@ def _register_accessor(name: str, cls):

Notes
-----
When accessed, your accessor will be initialized with the pandas object
the user is interacting with. So the signature must be

.. code-block:: python

def __init__(self, pandas_object): # noqa: E999
...

For consistency with pandas methods, you should raise an ``AttributeError``
if the data passed to your accessor has an incorrect dtype.

>>> pd.Series(['a', 'b']).dt
Traceback (most recent call last):
...
AttributeError: Can only use .dt accessor with datetimelike values

Examples
--------
In your library code::
This function is used to register user defined Accessor classes for {klass}.
An accessor class needs to:

* Have an init method
* that accepts only a single {klass} object as an argument
* Raise an AttributeError if the {klass} object does not have correct
input for this accessor (See examples)
* Have a method for every access pattern,
* methods can take any argument signature
* if an access pattern doesn't need any additional arguments,
it can be accessed as an attribute using the @property decorator.

import pandas as pd

@pd.api.extensions.register_dataframe_accessor("geo")
class GeoAccessor:
def __init__(self, pandas_obj):
self._obj = pandas_obj

@property
def center(self):
# return the geographic center point of this DataFrame
lat = self._obj.latitude
lon = self._obj.longitude
return (float(lon.mean()), float(lat.mean()))

def plot(self):
# plot this array's data on a map, e.g., using Cartopy
pass

Back in an interactive IPython session:

.. code-block:: ipython

In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
...: "latitude": np.linspace(0, 20)}})
In [2]: ds.geo.center
Out[2]: (5.0, 10.0)
In [3]: ds.geo.plot() # plots data on a map
"""

def decorator(accessor):
Expand All @@ -319,20 +287,268 @@ def decorator(accessor):
return decorator


# Extra documentation for ``register_dataframe_accessor``.  It is attached
# with ``Appender`` after the shared ``_register_accessor`` docstring has been
# rendered, so this text is expected to be concatenated as-is and not run
# through ``str.format`` -- hence single (unescaped) braces below.
# NOTE(review): confirm ``Appender`` does no formatting before relying on this.
df_accessor_example = """
For instance, if you want your accessor to accept only integer data,
the class might look like this:

.. code-block:: python

    @pd.api.extensions.register_dataframe_accessor("my_accessor")
    class MyAccessor:
        def __init__(self, pandas_obj):
            if not all(
                pandas_obj[col].dtype == 'int64' for col in pandas_obj.columns
            ):
                raise AttributeError(
                    "All columns should contain only integer values."
                )
            self._obj = pandas_obj

        def sum_squared(self):
            return (self._obj ** 2).sum()

        @property
        def total_elements(self):
            return self._obj.size


>>> df = pd.DataFrame([[1, 2], ['a', 'b']])  # incorrect dtype
>>> df.my_accessor
Traceback (most recent call last):
...
AttributeError: All columns should contain only integer values.

>>> df = pd.DataFrame([[1, 2], [3, 4]])
>>> df.my_accessor.sum_squared()
0    10
1    20
dtype: int64
>>> df.my_accessor.total_elements
4


Examples
--------
In your library code::

    import pandas as pd

    @pd.api.extensions.register_dataframe_accessor("geo")
    class GeoAccessor:
        def __init__(self, pandas_obj):
            if not {"latitude", "longitude"}.issubset(pandas_obj.columns):
                raise AttributeError(
                    "Must have 'latitude' and 'longitude' columns."
                )
            self._obj = pandas_obj

        @property
        def center(self):
            # return the geographic center point of this DataFrame
            lat = self._obj.latitude
            lon = self._obj.longitude
            return (float(lon.mean()), float(lat.mean()))

        def plot(self):
            # plot this array's data on a map, e.g., using Cartopy
            pass

Back in an interactive IPython session:

.. code-block:: ipython

    In [1]: ds = pd.DataFrame({"longitude": np.linspace(0, 10),
       ...:                    "latitude": np.linspace(0, 20)})
    In [2]: ds.geo.center
    Out[2]: (5.0, 10.0)
    In [3]: ds.geo.plot()  # plots data on a map
"""


# No docstring is written here on purpose: the text comes from the ``doc``
# template taken from ``_register_accessor`` (with klass="DataFrame") and the
# DataFrame-specific examples attached by ``Appender``.
@Appender(df_accessor_example)
@doc(_register_accessor, klass="DataFrame")
def register_dataframe_accessor(name: str):
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid a circular import while pandas is
    # initialising -- confirm before hoisting to the top of the file.
    from pandas import DataFrame as frame_cls

    accessor_decorator = _register_accessor(name, frame_cls)
    return accessor_decorator


_series_docu = """


.. code-block:: python

@pd.api.extensions.register_series_accessor("my_accessor")
class MyAccessor:
def __init__(self, pandas_obj):
if not infer_dtype(pandas_obj) == 'integer':
raise AttributeError("The series must contain only integer data.")
self._obj = pandas_obj

def sum_squared(self):
return (self._obj ** 2).sum()

@property
def total_elements(self):
return self._obj.size

>>> df = pd.Series([1, 'a', 2,'b'])
>>> df.my_accessor
Traceback (most recent call last):
...
AttributeError: The series must contain only integer data.

>>> df = pd.Series([1, 2, 3])
>>> df.my_accessor.sum_squared()
14
>>> df.my_accessor.total_elements
3

Examples
--------
In your library code::

import pandas as pd
from pandas.api.types import infer_dtype
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


@pd.api.extensions.register_series_accessor("nlp")
class NLPExtension:
def __init__(self, series):
if not infer_dtype(series) == 'string':
raise Attribute_error
self._obj = series

@property
def lemma(self):
lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(sentence):
words = word_tokenize(sentence)
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
return ' '.join(lemmatized_words)

return self._obj.apply(lemmatize_sentence)

Back in an interactive IPython session:

.. code-block:: ipython

In [1]: r = ['The cats are running', 'She flies kites.']
In [2]: ser = pd.Series(r, copy=False)
In [3]: ser.nlp.lemma
Out [3]:
0 The cat are running.
1 She fly kite.
dtype: object

"""


# No docstring is written here on purpose: the text comes from the ``doc``
# template taken from ``_register_accessor`` (with klass="Series") and the
# Series-specific examples attached by ``Appender``.
@Appender(_series_docu)
@doc(_register_accessor, klass="Series")
def register_series_accessor(name: str):
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid a circular import while pandas is
    # initialising -- confirm before hoisting to the top of the file.
    from pandas import Series as series_cls

    accessor_decorator = _register_accessor(name, series_cls)
    return accessor_decorator


_index_docu = """

.. code-block:: python

@pd.api.extensions.register_index_accessor("my_accessor")
class MyAccessor:
def __init__(self, raw_index):
if not all(isinstance(x, int) for x in raw_index):
raise AttributeError("The index can only be integer.")
self._obj = raw_index

def even(self):
return

@property
def odd(self):
return

>>> df = pd.DataFrame.from_dict({
'row2': {'1':1, '2':'a'},
'row2': {'1':2, '2':'b'}
},orient='index')
>>> df.index.my_accessor
Traceback (most recent call last):
...
AttributeError: The index can only be integer.


>>> df = pd.DataFrame({
'col1': [1, 2, 3, 4],
'col2': ['a', 'b', 'c', 'd']
}, index=[1, 2, 5, 8])
>>> df.index.my_accessor.even()
[2,8]
>>> df.index.my_accessor.number_of_odd
2

Examples
--------
In your library code::

from pandas.tseries.holiday import USFederalHolidayCalendar

@pd.api.extensions.register_index_accessor("timeoff")
class TimeOffAccessor:
def __init__(self, raw_index):
self._raw = raw_index
try:
self._date=pd.to_datetime(self._raw)
except:
raise AttributeError(f"Must be able to convert"
"index {self._raw} to datetime")

min_data = self._date.min()
max_data = self._date.max()
self._holydays = (
USFederalHolidayCalendar()
.holidays(start=min_data, end=max_data)
)

@property
def weekend(self):
is_weekend = self._date.weekday.isin([5,6])
return self._raw[is_weekend]
@property
def holyday(self):
is_holiday = self._date.isin(self._holydays)
return self._raw[is_holiday]

Back in an interactive IPython session:

.. code-block:: ipython

In[1]: exercise_data = pd.DataFrame.from_dict({
'1/1/2018':{'exercise':'run', 'minutes':30},
'1/4/2018':{'exercise':'swim', 'minutes':45},
'1/5/2018':{'exercise':'run', 'minutes':30},
'1/6/2018':{'exercise':'swim', 'minutes':45}
},orient='index')

In[2]: exercise_data
Out[2]: exercise minutes
1/1/2018 run 30
1/4/2018 swim 45
1/5/2018 run 30
1/6/2018 swim 45

In[3]: exercise_data.loc[exercise_data.index.timeoff.weekend]
Out[3]: exercise minutes
1/6/2018 swim 45
1/7/2018 run 30

In[4]: exercise_data.loc[exercise_data.index.timeoff.holyday]
Out[4]: exercise minutes
1/1/2018 run 30

"""


@Appender(_index_docu)
@doc(_register_accessor, klass="Index")
def register_index_accessor(name: str):
from pandas import Index
Expand Down