diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst
index 11d0c35f92ff5..387f65ea583a0 100644
--- a/doc/source/development/code_style.rst
+++ b/doc/source/development/code_style.rst
@@ -172,5 +172,6 @@ Reading from a url
 .. code-block:: python

     from pandas.io.common import urlopen
-    with urlopen('http://www.google.com') as url:
+
+    with urlopen("http://www.google.com") as url:
         raw_text = url.read()
diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index fbd83af3de82e..bdbcf5ca337b8 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -71,11 +71,13 @@ descriptor format for these as is follows:
 .. code-block:: python

     index = pd.RangeIndex(0, 10, 2)
-    {'kind': 'range',
-     'name': index.name,
-     'start': index.start,
-     'stop': index.stop,
-     'step': index.step}
+    {
+        "kind": "range",
+        "name": index.name,
+        "start": index.start,
+        "stop": index.stop,
+        "step": index.step,
+    }

 Other index types must be serialized as data columns along with the other
 DataFrame columns. The metadata for these is a string indicating the name of
diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst
index 8f1c3d5d818c2..cec385dd087db 100644
--- a/doc/source/development/internals.rst
+++ b/doc/source/development/internals.rst
@@ -68,8 +68,9 @@ integer **codes** (until version 0.24 named *labels*), and the level **names**:
 .. ipython:: python

-    index = pd.MultiIndex.from_product([range(3), ['one', 'two']],
-                                       names=['first', 'second'])
+    index = pd.MultiIndex.from_product(
+        [range(3), ["one", "two"]], names=["first", "second"]
+    )
     index
     index.levels
     index.codes
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
index 9da5d2a9fc92f..926c2d9be74c2 100644
--- a/doc/source/user_guide/categorical.rst
+++ b/doc/source/user_guide/categorical.rst
@@ -58,7 +58,7 @@ By converting an existing ``Series`` or column to a ``category`` dtype:
 .. ipython:: python

     df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
-    df["B"] = df["A"].astype('category')
+    df["B"] = df["A"].astype("category")
     df

 By using special functions, such as :func:`~pandas.cut`, which groups data into
@@ -66,18 +66,19 @@ discrete bins. See the :ref:`example on tiling <reshaping.tile.cut>` in the docs

 .. ipython:: python

-    df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
+    df = pd.DataFrame({"value": np.random.randint(0, 100, 20)})
     labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
-    df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
+    df["group"] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
     df.head(10)

 By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it to a ``DataFrame``.

 .. ipython:: python

-    raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"],
-                             ordered=False)
+    raw_cat = pd.Categorical(
+        ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=False
+    )
     s = pd.Series(raw_cat)
     s
     df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
@@ -100,7 +101,7 @@ This can be done during construction by specifying ``dtype="category"`` in the `
 .. ipython:: python

-    df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype="category")
+    df = pd.DataFrame({"A": list("abca"), "B": list("bccd")}, dtype="category")
     df.dtypes

 Note that the categories present in each column differ; the conversion is done column by column, so
@@ -108,24 +109,24 @@ only labels present in a given column are categories:

 .. ipython:: python

-    df['A']
-    df['B']
+    df["A"]
+    df["B"]

 Analogously, all columns in an existing ``DataFrame`` can be batch converted
 using :meth:`DataFrame.astype`:

 .. ipython:: python

-    df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
-    df_cat = df.astype('category')
+    df = pd.DataFrame({"A": list("abca"), "B": list("bccd")})
+    df_cat = df.astype("category")
     df_cat.dtypes

 This conversion is likewise done column by column:

 .. ipython:: python

-    df_cat['A']
-    df_cat['B']
+    df_cat["A"]
+    df_cat["B"]


 Controlling behavior
@@ -143,9 +144,9 @@ of :class:`~pandas.api.types.CategoricalDtype`.
 .. ipython:: python

     from pandas.api.types import CategoricalDtype
+
     s = pd.Series(["a", "b", "c", "a"])
-    cat_type = CategoricalDtype(categories=["b", "c", "d"],
-                                ordered=True)
+    cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
     s_cat = s.astype(cat_type)
     s_cat

@@ -155,12 +156,12 @@ are consistent among all columns.
 .. ipython:: python

     from pandas.api.types import CategoricalDtype
-    df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
-    cat_type = CategoricalDtype(categories=list('abcd'),
-                                ordered=True)
+
+    df = pd.DataFrame({"A": list("abca"), "B": list("bccd")})
+    cat_type = CategoricalDtype(categories=list("abcd"), ordered=True)
     df_cat = df.astype(cat_type)
-    df_cat['A']
-    df_cat['B']
+    df_cat["A"]
+    df_cat["B"]

 .. note::

@@ -175,8 +176,7 @@ during normal constructor mode:
 .. ipython:: python

     splitter = np.random.choice([0, 1], 5, p=[0.5, 0.5])
-    s = pd.Series(pd.Categorical.from_codes(splitter,
-                                            categories=["train", "test"]))
+    s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))


 Regaining original data
@@ -189,7 +189,7 @@ To get back to the original ``Series`` or NumPy array, use

     s = pd.Series(["a", "b", "c", "a"])
     s
-    s2 = s.astype('category')
+    s2 = s.astype("category")
     s2
     s2.astype(str)
     np.asarray(s2)
@@ -223,8 +223,9 @@ by default.
 .. ipython:: python

     from pandas.api.types import CategoricalDtype
-    CategoricalDtype(['a', 'b', 'c'])
-    CategoricalDtype(['a', 'b', 'c'], ordered=True)
+
+    CategoricalDtype(["a", "b", "c"])
+    CategoricalDtype(["a", "b", "c"], ordered=True)
     CategoricalDtype()

 A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas
@@ -248,19 +249,19 @@ unordered categoricals, the order of the ``categories`` is not considered.

 .. ipython:: python

-    c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False)
+    c1 = CategoricalDtype(["a", "b", "c"], ordered=False)

     # Equal, since order is not considered when ordered=False
-    c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False)
+    c1 == CategoricalDtype(["b", "c", "a"], ordered=False)

     # Unequal, since the second CategoricalDtype is ordered
-    c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True)
+    c1 == CategoricalDtype(["a", "b", "c"], ordered=True)

 All instances of ``CategoricalDtype`` compare equal to the string
 ``'category'``.

 .. ipython:: python

-    c1 == 'category'
+    c1 == "category"

 .. warning::

@@ -303,8 +304,7 @@ It's also possible to pass in the categories in a specific order:
 .. ipython:: python

-    s = pd.Series(pd.Categorical(["a", "b", "c", "a"],
-                                 categories=["c", "b", "a"]))
+    s = pd.Series(pd.Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"]))
     s.cat.categories
     s.cat.ordered

@@ -322,7 +322,7 @@ It's also possible to pass in the categories in a specific order:

 .. ipython:: python

-    s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd')))
+    s = pd.Series(list("babc")).astype(CategoricalDtype(list("abcd")))
     s

     # categories
@@ -348,7 +348,7 @@ Renaming categories is done by assigning new values to the

     s = s.cat.rename_categories([1, 2, 3])
     s
     # You can also pass a dict-like object to map the renaming
-    s = s.cat.rename_categories({1: 'x', 2: 'y', 3: 'z'})
+    s = s.cat.rename_categories({1: "x", 2: "y", 3: "z"})
     s

 .. note::
@@ -409,8 +409,7 @@ Removing unused categories can also be done:

 .. ipython:: python

-    s = pd.Series(pd.Categorical(["a", "b", "a"],
-                                 categories=["a", "b", "c", "d"]))
+    s = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", "c", "d"]))
     s
     s.cat.remove_unused_categories()

@@ -446,9 +445,7 @@ meaning and certain operations are possible. If the categorical is unordered, ``

     s = pd.Series(pd.Categorical(["a", "b", "c", "a"], ordered=False))
     s.sort_values(inplace=True)
-    s = pd.Series(["a", "b", "c", "a"]).astype(
-        CategoricalDtype(ordered=True)
-    )
+    s = pd.Series(["a", "b", "c", "a"]).astype(CategoricalDtype(ordered=True))
     s.sort_values(inplace=True)
     s
     s.min(), s.max()

@@ -514,18 +511,20 @@ The ordering of the categorical is determined by the ``categories`` of that colu

 .. ipython:: python

-    dfs = pd.DataFrame({'A': pd.Categorical(list('bbeebbaa'),
-                                            categories=['e', 'a', 'b'],
-                                            ordered=True),
-                        'B': [1, 2, 1, 2, 2, 1, 2, 1]})
-    dfs.sort_values(by=['A', 'B'])
+    dfs = pd.DataFrame(
+        {
+            "A": pd.Categorical(list("bbeebbaa"), categories=["e", "a", "b"], ordered=True),
+            "B": [1, 2, 1, 2, 2, 1, 2, 1],
+        }
+    )
+    dfs.sort_values(by=["A", "B"])

 Reordering the ``categories`` changes a future sort.

 .. ipython:: python

-    dfs['A'] = dfs['A'].cat.reorder_categories(['a', 'b', 'e'])
-    dfs.sort_values(by=['A', 'B'])
+    dfs["A"] = dfs["A"].cat.reorder_categories(["a", "b", "e"])
+    dfs.sort_values(by=["A", "B"])

 Comparisons
 -----------
@@ -550,15 +549,9 @@ categories or a categorical with any list-like object, will raise a ``TypeError`

 .. ipython:: python

-    cat = pd.Series([1, 2, 3]).astype(
-        CategoricalDtype([3, 2, 1], ordered=True)
-    )
-    cat_base = pd.Series([2, 2, 2]).astype(
-        CategoricalDtype([3, 2, 1], ordered=True)
-    )
-    cat_base2 = pd.Series([2, 2, 2]).astype(
-        CategoricalDtype(ordered=True)
-    )
+    cat = pd.Series([1, 2, 3]).astype(CategoricalDtype([3, 2, 1], ordered=True))
+    cat_base = pd.Series([2, 2, 2]).astype(CategoricalDtype([3, 2, 1], ordered=True))
+    cat_base2 = pd.Series([2, 2, 2]).astype(CategoricalDtype(ordered=True))

     cat
     cat_base
@@ -607,8 +600,8 @@ When you compare two unordered categoricals with the same categories, the order

 .. ipython:: python

-    c1 = pd.Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
-    c2 = pd.Categorical(['a', 'b'], categories=['b', 'a'], ordered=False)
+    c1 = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=False)
+    c2 = pd.Categorical(["a", "b"], categories=["b", "a"], ordered=False)
     c1 == c2

 Operations
 ----------
@@ -622,23 +615,21 @@ even if some categories are not present in the data:
 .. ipython:: python

-    s = pd.Series(pd.Categorical(["a", "b", "c", "c"],
-                                 categories=["c", "a", "b", "d"]))
+    s = pd.Series(pd.Categorical(["a", "b", "c", "c"], categories=["c", "a", "b", "d"]))
     s.value_counts()

 Groupby will also show "unused" categories:

 .. ipython:: python

-    cats = pd.Categorical(["a", "b", "b", "b", "c", "c", "c"],
-                          categories=["a", "b", "c", "d"])
+    cats = pd.Categorical(
+        ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"]
+    )
     df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 2, 3, 4, 5]})
     df.groupby("cats").mean()

     cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"])
-    df2 = pd.DataFrame({"cats": cats2,
-                        "B": ["c", "d", "c", "d"],
-                        "values": [1, 2, 3, 4]})
+    df2 = pd.DataFrame({"cats": cats2, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]})
     df2.groupby(["cats", "B"]).mean()


@@ -647,10 +638,8 @@ Pivot tables:
 .. ipython:: python

     raw_cat = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"])
-    df = pd.DataFrame({"A": raw_cat,
-                       "B": ["c", "d", "c", "d"],
-                       "values": [1, 2, 3, 4]})
-    pd.pivot_table(df, values='values', index=['A', 'B'])
+    df = pd.DataFrame({"A": raw_cat, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]})
+    pd.pivot_table(df, values="values", index=["A", "B"])

 Data munging
 ------------
@@ -668,8 +657,7 @@ If the slicing operation returns either a ``DataFrame`` or a column of type

 .. ipython:: python

     idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"])
-    cats = pd.Series(["a", "b", "b", "b", "c", "c", "c"],
-                     dtype="category", index=idx)
+    cats = pd.Series(["a", "b", "b", "b", "c", "c", "c"], dtype="category", index=idx)
     values = [1, 2, 2, 2, 3, 4, 5]
     df = pd.DataFrame({"cats": cats, "values": values}, index=idx)
     df.iloc[2:4, :]
@@ -714,13 +702,13 @@ an appropriate type:

 .. ipython:: python

-    str_s = pd.Series(list('aabb'))
-    str_cat = str_s.astype('category')
+    str_s = pd.Series(list("aabb"))
+    str_cat = str_s.astype("category")
     str_cat
     str_cat.str.contains("a")

-    date_s = pd.Series(pd.date_range('1/1/2015', periods=5))
-    date_cat = date_s.astype('category')
+    date_s = pd.Series(pd.date_range("1/1/2015", periods=5))
+    date_cat = date_s.astype("category")
     date_cat
     date_cat.dt.day
@@ -758,8 +746,7 @@ value is included in the ``categories``:

 .. ipython:: python

     idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"])
-    cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"],
-                          categories=["a", "b"])
+    cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"])
     values = [1, 1, 1, 1, 1, 1, 1]
     df = pd.DataFrame({"cats": cats, "values": values}, index=idx)

@@ -777,8 +764,7 @@ Setting values by assigning categorical data will also check that the ``categori

     df.loc["j":"k", "cats"] = pd.Categorical(["a", "a"], categories=["a", "b"])
     df
     try:
-        df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"],
-                                                 categories=["a", "b", "c"])
+        df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], categories=["a", "b", "c"])
     except ValueError as e:
         print("ValueError:", str(e))
@@ -809,12 +795,12 @@ dtypes will likely have higher memory usage. Use ``.astype`` or

     from pandas.api.types import union_categoricals

     # same categories
-    s1 = pd.Series(['a', 'b'], dtype='category')
-    s2 = pd.Series(['a', 'b', 'a'], dtype='category')
+    s1 = pd.Series(["a", "b"], dtype="category")
+    s2 = pd.Series(["a", "b", "a"], dtype="category")
     pd.concat([s1, s2])

     # different categories
-    s3 = pd.Series(['b', 'c'], dtype='category')
+    s3 = pd.Series(["b", "c"], dtype="category")
     pd.concat([s1, s3])

     # Output dtype is inferred based on categories values
     int_cats = pd.Series([1, 2], dtype="category")
     float_cats = pd.Series([3.0, 4.0], dtype="category")
     pd.concat([int_cats, float_cats])

-    pd.concat([s1, s3]).astype('category')
+    pd.concat([s1, s3]).astype("category")
     union_categoricals([s1.array, s3.array])

 The following table summarizes the results of merging ``Categoricals``:
@@ -853,6 +839,7 @@ the categories being combined.
 .. ipython:: python

     from pandas.api.types import union_categoricals
+
     a = pd.Categorical(["b", "c"])
     b = pd.Categorical(["a", "b"])
     union_categoricals([a, b])
@@ -900,8 +887,8 @@ the resulting array will always be a plain ``Categorical``:

 .. ipython:: python

-    a = pd.Series(["b", "c"], dtype='category')
-    b = pd.Series(["a", "b"], dtype='category')
+    a = pd.Series(["b", "c"], dtype="category")
+    b = pd.Series(["a", "b"], dtype="category")
     union_categoricals([a, b])

 .. note::
@@ -946,7 +933,8 @@ relevant columns back to ``category`` and assign the right categories and catego
 .. ipython:: python

     import io
-    s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd']))
+
+    s = pd.Series(pd.Categorical(["a", "b", "b", "a", "a", "d"]))
     # rename the categories
     s.cat.categories = ["very good", "good", "bad"]
     # reorder the categories and add missing categories
@@ -959,9 +947,9 @@ relevant columns back to ``category`` and assign the right categories and catego

     df2["cats"]
     # Redo the category
     df2["cats"] = df2["cats"].astype("category")
-    df2["cats"].cat.set_categories(["very bad", "bad", "medium",
-                                    "good", "very good"],
-                                   inplace=True)
+    df2["cats"].cat.set_categories(
+        ["very bad", "bad", "medium", "good", "very good"], inplace=True
+    )
     df2.dtypes
     df2["cats"]
@@ -1029,13 +1017,13 @@ an ``object`` dtype is a constant times the length of the data.

 .. ipython:: python

-    s = pd.Series(['foo', 'bar'] * 1000)
+    s = pd.Series(["foo", "bar"] * 1000)

     # object dtype
     s.nbytes

     # category dtype
-    s.astype('category').nbytes
+    s.astype("category").nbytes

 .. note::
@@ -1044,13 +1032,13 @@ an ``object`` dtype is a constant times the length of the data.

 .. ipython:: python

-    s = pd.Series(['foo%04d' % i for i in range(2000)])
+    s = pd.Series(["foo%04d" % i for i in range(2000)])

     # object dtype
     s.nbytes

     # category dtype
-    s.astype('category').nbytes
+    s.astype("category").nbytes


 ``Categorical`` is not a ``numpy`` array
@@ -1085,8 +1073,8 @@ To check if a Series contains Categorical data, use ``hasattr(s, 'cat')``:

 .. ipython:: python

-    hasattr(pd.Series(['a'], dtype='category'), 'cat')
-    hasattr(pd.Series(['a']), 'cat')
+    hasattr(pd.Series(["a"], dtype="category"), "cat")
+    hasattr(pd.Series(["a"]), "cat")

 Using NumPy functions on a ``Series`` of type ``category`` should not work as
 ``Categoricals`` are not numeric data (even in the case that ``.categories`` is
 numeric).
@@ -1113,9 +1101,9 @@ You can use ``fillna`` to handle missing values before applying a function.
 .. ipython:: python

-    df = pd.DataFrame({"a": [1, 2, 3, 4],
-                       "b": ["a", "b", "c", "d"],
-                       "cats": pd.Categorical([1, 2, 3, 2])})
+    df = pd.DataFrame(
+        {"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], "cats": pd.Categorical([1, 2, 3, 2])}
+    )
     df.apply(lambda row: type(row["cats"]), axis=1)
     df.apply(lambda col: col.dtype, axis=0)
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst
index a45d7a4fa1547..acee1638570f7 100644
--- a/doc/source/user_guide/integer_na.rst
+++ b/doc/source/user_guide/integer_na.rst
@@ -112,7 +112,7 @@ dtype if needed.
     s.iloc[1:3]

     # operate with other dtypes
-    s + s.iloc[1:3].astype('Int8')
+    s + s.iloc[1:3].astype("Int8")

     # coerce when needed
     s + 0.01
@@ -121,7 +121,7 @@ These dtypes can operate as part of of ``DataFrame``.

 .. ipython:: python

-    df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')})
+    df = pd.DataFrame({"A": s, "B": [1, 1, 3], "C": list("aab")})
     df
     df.dtypes
@@ -130,15 +130,15 @@ These dtypes can be merged & reshaped & casted.

 .. ipython:: python

-    pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes
-    df['A'].astype(float)
+    pd.concat([df[["A"]], df[["B", "C"]]], axis=1).dtypes
+    df["A"].astype(float)

 Reduction and groupby operations such as 'sum' work as well.

 .. ipython:: python

     df.sum()
-    df.groupby('B').A.sum()
+    df.groupby("B").A.sum()

 Scalar NA Value
 ---------------
diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst
index 563fc941294d1..d222297abc70b 100644
--- a/doc/source/user_guide/options.rst
+++ b/doc/source/user_guide/options.rst
@@ -17,6 +17,7 @@ You can get/set options directly as attributes of the top-level ``options`` attr
 .. ipython:: python

     import pandas as pd
+
     pd.options.display.max_rows
     pd.options.display.max_rows = 999
     pd.options.display.max_rows
@@ -77,9 +78,9 @@ are available from the pandas namespace. To change an option, call

 .. ipython:: python

-    pd.get_option('mode.sim_interactive')
-    pd.set_option('mode.sim_interactive', True)
-    pd.get_option('mode.sim_interactive')
+    pd.get_option("mode.sim_interactive")
+    pd.set_option("mode.sim_interactive", True)
+    pd.get_option("mode.sim_interactive")

 **Note:** The option 'mode.sim_interactive' is mostly used for debugging purposes.
@@ -135,8 +136,9 @@ More information can be found in the `ipython documentation
 .. code-block:: python

     import pandas as pd
-    pd.set_option('display.max_rows', 999)
-    pd.set_option('precision', 5)
+
+    pd.set_option("display.max_rows", 999)
+    pd.set_option("precision", 5)

 .. _options.frequently_used:
@@ -151,27 +153,27 @@ lines are replaced by an ellipsis.

 .. ipython:: python

     df = pd.DataFrame(np.random.randn(7, 2))
-    pd.set_option('max_rows', 7)
+    pd.set_option("max_rows", 7)
     df
-    pd.set_option('max_rows', 5)
+    pd.set_option("max_rows", 5)
     df
-    pd.reset_option('max_rows')
+    pd.reset_option("max_rows")

 Once the ``display.max_rows`` is exceeded, the ``display.min_rows`` options
 determines how many rows are shown in the truncated repr.
 .. ipython:: python

-    pd.set_option('max_rows', 8)
-    pd.set_option('min_rows', 4)
+    pd.set_option("max_rows", 8)
+    pd.set_option("min_rows", 4)

     # below max_rows -> all rows shown
     df = pd.DataFrame(np.random.randn(7, 2))
     df

     # above max_rows -> only min_rows (4) rows shown
     df = pd.DataFrame(np.random.randn(9, 2))
     df

-    pd.reset_option('max_rows')
-    pd.reset_option('min_rows')
+    pd.reset_option("max_rows")
+    pd.reset_option("min_rows")

 ``display.expand_frame_repr`` allows for the representation of
 dataframes to stretch across pages, wrapped over the full column vs row-wise.
@@ -179,11 +181,11 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise.

 .. ipython:: python

     df = pd.DataFrame(np.random.randn(5, 10))
-    pd.set_option('expand_frame_repr', True)
+    pd.set_option("expand_frame_repr", True)
     df
-    pd.set_option('expand_frame_repr', False)
+    pd.set_option("expand_frame_repr", False)
     df
-    pd.reset_option('expand_frame_repr')
+    pd.reset_option("expand_frame_repr")

 ``display.large_repr`` lets you select whether to display dataframes that exceed
 ``max_columns`` or ``max_rows`` as a truncated frame, or as a summary.
@@ -191,26 +193,32 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise.

 .. ipython:: python

     df = pd.DataFrame(np.random.randn(10, 10))
-    pd.set_option('max_rows', 5)
-    pd.set_option('large_repr', 'truncate')
+    pd.set_option("max_rows", 5)
+    pd.set_option("large_repr", "truncate")
     df
-    pd.set_option('large_repr', 'info')
+    pd.set_option("large_repr", "info")
     df
-    pd.reset_option('large_repr')
-    pd.reset_option('max_rows')
+    pd.reset_option("large_repr")
+    pd.reset_option("max_rows")

 ``display.max_colwidth`` sets the maximum width of columns. Cells
 of this length or longer will be truncated with an ellipsis.

 .. ipython:: python

-    df = pd.DataFrame(np.array([['foo', 'bar', 'bim', 'uncomfortably long string'],
-                                ['horse', 'cow', 'banana', 'apple']]))
-    pd.set_option('max_colwidth', 40)
+    df = pd.DataFrame(
+        np.array(
+            [
+                ["foo", "bar", "bim", "uncomfortably long string"],
+                ["horse", "cow", "banana", "apple"],
+            ]
+        )
+    )
+    pd.set_option("max_colwidth", 40)
     df
-    pd.set_option('max_colwidth', 6)
+    pd.set_option("max_colwidth", 6)
     df
-    pd.reset_option('max_colwidth')
+    pd.reset_option("max_colwidth")

 ``display.max_info_columns`` sets a threshold for when by-column info
 will be given.
@@ -218,11 +226,11 @@ will be given.

 .. ipython:: python

     df = pd.DataFrame(np.random.randn(10, 10))
-    pd.set_option('max_info_columns', 11)
+    pd.set_option("max_info_columns", 11)
     df.info()
-    pd.set_option('max_info_columns', 5)
+    pd.set_option("max_info_columns", 5)
     df.info()
-    pd.reset_option('max_info_columns')
+    pd.reset_option("max_info_columns")

 ``display.max_info_rows``: ``df.info()`` will usually show null-counts for each column.
 For large frames this can be quite slow. ``max_info_rows`` and ``max_info_cols``
@@ -233,11 +241,11 @@ can specify the option ``df.info(null_counts=True)`` to override on showing a pa

     df = pd.DataFrame(np.random.choice([0, 1, np.nan], size=(10, 10)))
     df
-    pd.set_option('max_info_rows', 11)
+    pd.set_option("max_info_rows", 11)
     df.info()
-    pd.set_option('max_info_rows', 5)
+    pd.set_option("max_info_rows", 5)
     df.info()
-    pd.reset_option('max_info_rows')
+    pd.reset_option("max_info_rows")

 ``display.precision`` sets the output display precision in terms of decimal places.
@@ -245,9 +253,9 @@ This is only a suggestion.
 .. ipython:: python

     df = pd.DataFrame(np.random.randn(5, 5))
-    pd.set_option('precision', 7)
+    pd.set_option("precision", 7)
     df
-    pd.set_option('precision', 4)
+    pd.set_option("precision", 4)
     df

 ``display.chop_threshold`` sets at what level pandas rounds to zero when
 precision at which the number is stored.

 .. ipython:: python

     df = pd.DataFrame(np.random.randn(6, 6))
-    pd.set_option('chop_threshold', 0)
+    pd.set_option("chop_threshold", 0)
     df
-    pd.set_option('chop_threshold', .5)
+    pd.set_option("chop_threshold", 0.5)
     df
-    pd.reset_option('chop_threshold')
+    pd.reset_option("chop_threshold")

 ``display.colheader_justify`` controls the justification of the headers.
 The options are 'right', and 'left'.

 .. ipython:: python

-    df = pd.DataFrame(np.array([np.random.randn(6),
-                                np.random.randint(1, 9, 6) * .1,
-                                np.zeros(6)]).T,
-                      columns=['A', 'B', 'C'], dtype='float')
-    pd.set_option('colheader_justify', 'right')
+    df = pd.DataFrame(
+        np.array([np.random.randn(6), np.random.randint(1, 9, 6) * 0.1, np.zeros(6)]).T,
+        columns=["A", "B", "C"],
+        dtype="float",
+    )
+    pd.set_option("colheader_justify", "right")
     df
-    pd.set_option('colheader_justify', 'left')
+    pd.set_option("colheader_justify", "left")
     df
-    pd.reset_option('colheader_justify')
+    pd.reset_option("colheader_justify")


@@ -481,9 +490,9 @@ For instance:

     import numpy as np

     pd.set_eng_float_format(accuracy=3, use_eng_prefix=True)
-    s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
-    s / 1.e3
-    s / 1.e6
+    s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
+    s / 1.0e3
+    s / 1.0e6

 .. ipython:: python
    :suppress:
@@ -510,7 +519,7 @@ If a DataFrame or Series contains these characters, the default output mode may

 .. ipython:: python

-    df = pd.DataFrame({'国籍': ['UK', '日本'], '名前': ['Alice', 'しのぶ']})
+    df = pd.DataFrame({"国籍": ["UK", "日本"], "名前": ["Alice", "しのぶ"]})
     df

 .. image:: ../_static/option_unicode01.png
@@ -521,7 +530,7 @@ times than the standard ``len`` function.

 .. ipython:: python

-    pd.set_option('display.unicode.east_asian_width', True)
+    pd.set_option("display.unicode.east_asian_width", True)
     df

 .. image:: ../_static/option_unicode02.png
@@ -533,7 +542,7 @@ By default, an "Ambiguous" character's width, such as "¡" (inverted exclamation

 .. ipython:: python

-    df = pd.DataFrame({'a': ['xxx', '¡¡'], 'b': ['yyy', '¡¡']})
+    df = pd.DataFrame({"a": ["xxx", "¡¡"], "b": ["yyy", "¡¡"]})
     df

 .. image:: ../_static/option_unicode03.png
@@ -545,7 +554,7 @@ However, setting this option incorrectly for your terminal will cause these char

 .. ipython:: python

-    pd.set_option('display.unicode.ambiguous_as_wide', True)
+    pd.set_option("display.unicode.ambiguous_as_wide", True)
     df

 .. image:: ../_static/option_unicode04.png

 .. ipython:: python
    :suppress:

-    pd.set_option('display.unicode.east_asian_width', False)
-    pd.set_option('display.unicode.ambiguous_as_wide', False)
+    pd.set_option("display.unicode.east_asian_width", False)
+    pd.set_option("display.unicode.ambiguous_as_wide", False)

 .. _options.table_schema:
@@ -567,7 +576,7 @@ by default. False by default, this can be enabled globally with the

 .. ipython:: python

-    pd.set_option('display.html.table_schema', True)
+    pd.set_option("display.html.table_schema", True)

 Only ``'display.max_rows'`` are serialized and published.
@@ -575,4 +584,4 @@ Only ``'display.max_rows'`` are serialized and published.
 .. ipython:: python
    :suppress:

-    pd.reset_option('display.html.table_schema')
+    pd.reset_option("display.html.table_schema")
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
index e6797512ce3cf..2061185b25416 100644
--- a/doc/source/user_guide/reshaping.rst
+++ b/doc/source/user_guide/reshaping.rst
@@ -18,14 +18,18 @@ Reshaping by pivoting DataFrame objects

     import pandas._testing as tm

+
     def unpivot(frame):
         N, K = frame.shape
-        data = {'value': frame.to_numpy().ravel('F'),
-                'variable': np.asarray(frame.columns).repeat(N),
-                'date': np.tile(np.asarray(frame.index), K)}
-        columns = ['date', 'variable', 'value']
+        data = {
+            "value": frame.to_numpy().ravel("F"),
+            "variable": np.asarray(frame.columns).repeat(N),
+            "date": np.tile(np.asarray(frame.index), K),
+        }
+        columns = ["date", "variable", "value"]
         return pd.DataFrame(data, columns=columns)

+
     df = unpivot(tm.makeTimeDataFrame(3))

 Data is often stored in so-called "stacked" or "record" format:
@@ -41,12 +45,15 @@ For the curious here is how the above ``DataFrame`` was created:

     import pandas._testing as tm

+
     def unpivot(frame):
         N, K = frame.shape
-        data = {'value': frame.to_numpy().ravel('F'),
-                'variable': np.asarray(frame.columns).repeat(N),
-                'date': np.tile(np.asarray(frame.index), K)}
-        return pd.DataFrame(data, columns=['date', 'variable', 'value'])
+        data = {
+            "value": frame.to_numpy().ravel("F"),
+            "variable": np.asarray(frame.columns).repeat(N),
+            "date": np.tile(np.asarray(frame.index), K),
+        }
+        return pd.DataFrame(data, columns=["date", "variable", "value"])

     df = unpivot(tm.makeTimeDataFrame(3))
@@ -55,7 +62,7 @@ To select out everything for variable ``A`` we could do:

 .. ipython:: python

-    df[df['variable'] == 'A']
+    df[df["variable"] == "A"]

 But suppose we wish to do time series operations with the variables. A better
 representation would be where the ``columns`` are the unique variables and an
@@ -65,7 +72,7 @@ top level function :func:`~pandas.pivot`):

 .. ipython:: python

-    df.pivot(index='date', columns='variable', values='value')
+    df.pivot(index="date", columns="variable", values="value")

 If the ``values`` argument is omitted, and the input ``DataFrame`` has more than
 one column of values which are not used as column or index inputs to ``pivot``,
@@ -75,15 +82,15 @@ column:

 .. ipython:: python

-    df['value2'] = df['value'] * 2
-    pivoted = df.pivot(index='date', columns='variable')
+    df["value2"] = df["value"] * 2
+    pivoted = df.pivot(index="date", columns="variable")
     pivoted

 You can then select subsets from the pivoted ``DataFrame``:

 .. ipython:: python

-    pivoted['value2']
+    pivoted["value2"]

 Note that this returns a view on the underlying data in the case where the data
 are homogeneously-typed.
@@ -121,12 +128,16 @@ from the hierarchical indexing section:

 .. ipython:: python

-    tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
-                         'foo', 'foo', 'qux', 'qux'],
-                        ['one', 'two', 'one', 'two',
-                         'one', 'two', 'one', 'two']]))
-    index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
-    df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
+    tuples = list(
+        zip(
+            *[
+                ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+                ["one", "two", "one", "two", "one", "two", "one", "two"],
+            ]
+        )
+    )
+    index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
+    df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
     df2 = df[:4]
     df2
@@ -163,7 +174,7 @@ the level numbers:
 .. ipython:: python

-    stacked.unstack('second')
+    stacked.unstack("second")

 .. image:: ../_static/reshaping_unstack_0.png
@@ -174,8 +185,8 @@ will result in a **sorted** copy of the original ``DataFrame`` or ``Series``:

 .. ipython:: python

-    index = pd.MultiIndex.from_product([[2, 1], ['a', 'b']])
-    df = pd.DataFrame(np.random.randn(4), index=index, columns=['A'])
+    index = pd.MultiIndex.from_product([[2, 1], ["a", "b"]])
+    df = pd.DataFrame(np.random.randn(4), index=index, columns=["A"])
     df
     all(df.unstack().stack() == df.sort_index())
@@ -193,15 +204,19 @@ processed individually.

 .. ipython:: python

-    columns = pd.MultiIndex.from_tuples([
-        ('A', 'cat', 'long'), ('B', 'cat', 'long'),
-        ('A', 'dog', 'short'), ('B', 'dog', 'short')],
-        names=['exp', 'animal', 'hair_length']
+    columns = pd.MultiIndex.from_tuples(
+        [
+            ("A", "cat", "long"),
+            ("B", "cat", "long"),
+            ("A", "dog", "short"),
+            ("B", "dog", "short"),
+        ],
+        names=["exp", "animal", "hair_length"],
     )
     df = pd.DataFrame(np.random.randn(4, 4), columns=columns)
     df

-    df.stack(level=['animal', 'hair_length'])
+    df.stack(level=["animal", "hair_length"])

 The list of levels can contain either level names or level numbers (but
 not a mixture of the two).
@@ -222,12 +237,12 @@ calling ``sort_index``, of course). Here is a more complex example:

 .. ipython:: python

-    columns = pd.MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
-                                         ('B', 'cat'), ('A', 'dog')],
-                                        names=['exp', 'animal'])
-    index = pd.MultiIndex.from_product([('bar', 'baz', 'foo', 'qux'),
-                                        ('one', 'two')],
-                                       names=['first', 'second'])
+    columns = pd.MultiIndex.from_tuples(
+        [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")], names=["exp", "animal"]
+    )
+    index = pd.MultiIndex.from_product(
+        [("bar", "baz", "foo", "qux"), ("one", "two")], names=["first", "second"]
+    )
     df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns)
     df2 = df.iloc[[0, 1, 2, 4, 5, 7]]
     df2
@@ -237,8 +252,8 @@ which level in the columns to stack:

 .. ipython:: python

-    df2.stack('exp')
-    df2.stack('animal')
+    df2.stack("exp")
+    df2.stack("animal")

 Unstacking can result in missing values if subgroups do not have the same
 set of labels.  By default, missing values will be replaced with the default
@@ -288,13 +303,17 @@ For instance,

 .. ipython:: python

-    cheese = pd.DataFrame({'first': ['John', 'Mary'],
-                           'last': ['Doe', 'Bo'],
-                           'height': [5.5, 6.0],
-                           'weight': [130, 150]})
+    cheese = pd.DataFrame(
+        {
+            "first": ["John", "Mary"],
+            "last": ["Doe", "Bo"],
+            "height": [5.5, 6.0],
+            "weight": [130, 150],
+        }
+    )
     cheese
-    cheese.melt(id_vars=['first', 'last'])
-    cheese.melt(id_vars=['first', 'last'], var_name='quantity')
+    cheese.melt(id_vars=["first", "last"])
+    cheese.melt(id_vars=["first", "last"], var_name="quantity")

 When transforming a DataFrame using :func:`~pandas.melt`, the index will be ignored.
 The original index values can be kept around by setting the ``ignore_index`` parameter
 to ``False`` (default is ``True``). This will however duplicate them.
@@ -302,15 +321,19 @@
 .. ipython:: python

-    index = pd.MultiIndex.from_tuples([('person', 'A'), ('person', 'B')])
-    cheese = pd.DataFrame({'first': ['John', 'Mary'],
-                           'last': ['Doe', 'Bo'],
-                           'height': [5.5, 6.0],
-                           'weight': [130, 150]},
-                          index=index)
+    index = pd.MultiIndex.from_tuples([("person", "A"), ("person", "B")])
+    cheese = pd.DataFrame(
+        {
+            "first": ["John", "Mary"],
+            "last": ["Doe", "Bo"],
+            "height": [5.5, 6.0],
+            "weight": [130, 150],
+        },
+        index=index,
+    )
     cheese
-    cheese.melt(id_vars=['first', 'last'])
-    cheese.melt(id_vars=['first', 'last'], ignore_index=False)
+    cheese.melt(id_vars=["first", "last"])
+    cheese.melt(id_vars=["first", "last"], ignore_index=False)

 Another way to transform is to use the :func:`~pandas.wide_to_long` panel data
 convenience function. It is less flexible than :func:`~pandas.melt`, but more
 user-friendly.

 .. ipython:: python

-    dft = pd.DataFrame({"A1970": {0: "a", 1: "b", 2: "c"},
-                        "A1980": {0: "d", 1: "e", 2: "f"},
-                        "B1970": {0: 2.5, 1: 1.2, 2: .7},
-                        "B1980": {0: 3.2, 1: 1.3, 2: .1},
-                        "X": dict(zip(range(3), np.random.randn(3)))
-                        })
+    dft = pd.DataFrame(
+        {
+            "A1970": {0: "a", 1: "b", 2: "c"},
+            "A1980": {0: "d", 1: "e", 2: "f"},
+            "B1970": {0: 2.5, 1: 1.2, 2: 0.7},
+            "B1980": {0: 3.2, 1: 1.3, 2: 0.1},
+            "X": dict(zip(range(3), np.random.randn(3))),
+        }
+    )
     dft["id"] = dft.index
     dft
     pd.wide_to_long(dft, ["A", "B"], i="id", j="year")
@@ -380,23 +406,27 @@ Consider a data set like this:

 .. ipython:: python

     import datetime

-    df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6,
-                       'B': ['A', 'B', 'C'] * 8,
-                       'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4,
-                       'D': np.random.randn(24),
-                       'E': np.random.randn(24),
-                       'F': [datetime.datetime(2013, i, 1) for i in range(1, 13)]
-                       + [datetime.datetime(2013, i, 15) for i in range(1, 13)]})
+
+    df = pd.DataFrame(
+        {
+            "A": ["one", "one", "two", "three"] * 6,
+            "B": ["A", "B", "C"] * 8,
+            "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
+            "D": np.random.randn(24),
+            "E": np.random.randn(24),
+            "F": [datetime.datetime(2013, i, 1) for i in range(1, 13)]
+            + [datetime.datetime(2013, i, 15) for i in range(1, 13)],
+        }
+    )
     df

 We can produce pivot tables from this data very easily:

 .. ipython:: python

-    pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
-    pd.pivot_table(df, values='D', index=['B'], columns=['A', 'C'], aggfunc=np.sum)
-    pd.pivot_table(df, values=['D', 'E'], index=['B'], columns=['A', 'C'],
-                   aggfunc=np.sum)
+    pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])
+    pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc=np.sum)
+    pd.pivot_table(df, values=["D", "E"], index=["B"], columns=["A", "C"], aggfunc=np.sum)

 The result object is a ``DataFrame`` having potentially hierarchical indexes on the
 rows and columns. If the ``values`` column name is not given, the pivot table
@@ -405,22 +435,21 @@ hierarchy in the columns:

 .. ipython:: python

-    pd.pivot_table(df, index=['A', 'B'], columns=['C'])
+    pd.pivot_table(df, index=["A", "B"], columns=["C"])

 Also, you can use ``Grouper`` for ``index`` and ``columns`` keywords. For detail of ``Grouper``,
 see :ref:`Grouping with a Grouper specification `.

 .. ipython:: python

-    pd.pivot_table(df, values='D', index=pd.Grouper(freq='M', key='F'),
-                   columns='C')
+    pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C")

 You can render a nice output of the table omitting the missing values by
 calling ``to_string`` if you wish:
 .. ipython:: python

-    table = pd.pivot_table(df, index=['A', 'B'], columns=['C'])
-    print(table.to_string(na_rep=''))
+    table = pd.pivot_table(df, index=["A", "B"], columns=["C"])
+    print(table.to_string(na_rep=""))

 Note that ``pivot_table`` is also available as an instance method on DataFrame,
 i.e. :meth:`DataFrame.pivot_table`.
@@ -436,7 +465,7 @@ rows and columns:

 .. ipython:: python

-    df.pivot_table(index=['A', 'B'], columns='C', margins=True, aggfunc=np.std)
+    df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std)

 .. _reshaping.crosstabulations:
@@ -470,30 +499,31 @@ For example:

 .. ipython:: python

-    foo, bar, dull, shiny, one, two = 'foo', 'bar', 'dull', 'shiny', 'one', 'two'
+    foo, bar, dull, shiny, one, two = "foo", "bar", "dull", "shiny", "one", "two"

     a = np.array([foo, foo, bar, bar, foo, foo], dtype=object)
     b = np.array([one, one, two, one, two, one], dtype=object)
     c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object)

-    pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
+    pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])

 If ``crosstab`` receives only two Series, it will provide a frequency table.

 .. ipython:: python

-    df = pd.DataFrame({'A': [1, 2, 2, 2, 2], 'B': [3, 3, 4, 4, 4],
-                       'C': [1, 1, np.nan, 1, 1]})
+    df = pd.DataFrame(
+        {"A": [1, 2, 2, 2, 2], "B": [3, 3, 4, 4, 4], "C": [1, 1, np.nan, 1, 1]}
+    )
     df

-    pd.crosstab(df['A'], df['B'])
+    pd.crosstab(df["A"], df["B"])

 ``crosstab`` can also be implemented to ``Categorical`` data.

 .. ipython:: python

-    foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
-    bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
+    foo = pd.Categorical(["a", "b"], categories=["a", "b", "c"])
+    bar = pd.Categorical(["d", "e"], categories=["d", "e", "f"])
     pd.crosstab(foo, bar)

 If you want to include **all** of data categories even if the actual data does
@@ -513,13 +543,13 @@ using the ``normalize`` argument:

 .. ipython:: python

-    pd.crosstab(df['A'], df['B'], normalize=True)
+    pd.crosstab(df["A"], df["B"], normalize=True)

 ``normalize`` can also normalize values within each row or within each column:

 .. ipython:: python

-    pd.crosstab(df['A'], df['B'], normalize='columns')
+    pd.crosstab(df["A"], df["B"], normalize="columns")

 ``crosstab`` can also be passed a third ``Series`` and an aggregation function
 (``aggfunc``) that will be applied to the values of the third ``Series`` within
 each group defined by the first two ``Series``:

 .. ipython:: python

-    pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum)
+    pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc=np.sum)

 Adding margins
 ~~~~~~~~~~~~~~
@@ -536,8 +566,9 @@ Finally, one can also add margins or normalize this output.

 .. ipython:: python

-    pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, normalize=True,
-                margins=True)
+    pd.crosstab(
+        df["A"], df["B"], values=df["C"], aggfunc=np.sum, normalize=True, margins=True
+    )

 .. _reshaping.tile:
 .. _reshaping.tile.cut:
@@ -581,19 +612,19 @@ values, can derive a ``DataFrame`` containing ``k`` columns of 1s and 0s using

 .. ipython:: python

-    df = pd.DataFrame({'key': list('bbacab'), 'data1': range(6)})
+    df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)})

-    pd.get_dummies(df['key'])
+    pd.get_dummies(df["key"])

 Sometimes it's useful to prefix the column names, for example when merging the result
 with the original ``DataFrame``:
 .. ipython:: python

-    dummies = pd.get_dummies(df['key'], prefix='key')
+    dummies = pd.get_dummies(df["key"], prefix="key")
     dummies

-    df[['data1']].join(dummies)
+    df[["data1"]].join(dummies)

 This function is often used along with discretization functions like ``cut``:
@@ -615,8 +646,7 @@ variables (categorical in the statistical sense, those with ``object`` or

 .. ipython:: python

-    df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'],
-                       'C': [1, 2, 3]})
+    df = pd.DataFrame({"A": ["a", "b", "a"], "B": ["c", "c", "b"], "C": [1, 2, 3]})
     pd.get_dummies(df)

 All non-object columns are included untouched in the output. You can control
@@ -624,7 +654,7 @@ the columns that are encoded with the ``columns`` keyword.

 .. ipython:: python

-    pd.get_dummies(df, columns=['A'])
+    pd.get_dummies(df, columns=["A"])

 Notice that the ``B`` column is still included in the output, it just hasn't
 been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't
@@ -641,11 +671,11 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways:

 .. ipython:: python

-    simple = pd.get_dummies(df, prefix='new_prefix')
+    simple = pd.get_dummies(df, prefix="new_prefix")
     simple
-    from_list = pd.get_dummies(df, prefix=['from_A', 'from_B'])
+    from_list = pd.get_dummies(df, prefix=["from_A", "from_B"])
     from_list
-    from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
+    from_dict = pd.get_dummies(df, prefix={"B": "from_B", "A": "from_A"})
     from_dict

 Sometimes it will be useful to only keep k-1 levels of a categorical
@@ -654,7 +684,7 @@ You can switch to this mode by turn on ``drop_first``.

 .. ipython:: python

-    s = pd.Series(list('abcaa'))
+    s = pd.Series(list("abcaa"))

     pd.get_dummies(s)
@@ -664,7 +694,7 @@ When a column contains only one level, it will be omitted in the result.

 .. ipython:: python

-    df = pd.DataFrame({'A': list('aaaaa'), 'B': list('ababc')})
+    df = pd.DataFrame({"A": list("aaaaa"), "B": list("ababc")})

     pd.get_dummies(df)
@@ -675,7 +705,7 @@ To choose another dtype, use the ``dtype`` argument:

 .. ipython:: python

-    df = pd.DataFrame({'A': list('abc'), 'B': [1.1, 2.2, 3.3]})
+    df = pd.DataFrame({"A": list("abc"), "B": [1.1, 2.2, 3.3]})

     pd.get_dummies(df, dtype=bool).dtypes
@@ -689,7 +719,7 @@ To encode 1-d values as an enumerated type use :func:`~pandas.factorize`:

 .. ipython:: python

-    x = pd.Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
+    x = pd.Series(["A", "A", np.nan, "B", 3.14, np.inf])
     x
     labels, uniques = pd.factorize(x)
     labels
     uniques
@@ -733,11 +763,12 @@ DataFrame will be pivoted in the answers below.

     np.random.seed([3, 1415])
     n = 20

-    cols = np.array(['key', 'row', 'item', 'col'])
-    df = cols + pd.DataFrame((np.random.randint(5, size=(n, 4))
-                              // [2, 1, 2, 1]).astype(str))
+    cols = np.array(["key", "row", "item", "col"])
+    df = cols + pd.DataFrame(
+        (np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str)
+    )
     df.columns = cols
-    df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix('val'))
+    df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix("val"))

     df
@@ -762,24 +793,21 @@ This solution uses :func:`~pandas.pivot_table`. Also note that

 .. ipython:: python

-    df.pivot_table(
-        values='val0', index='row', columns='col', aggfunc='mean')
+    df.pivot_table(values="val0", index="row", columns="col", aggfunc="mean")

 Note that we can also replace the missing values by using the ``fill_value``
 parameter.
 .. ipython:: python

-    df.pivot_table(
-        values='val0', index='row', columns='col', aggfunc='mean', fill_value=0)
+    df.pivot_table(values="val0", index="row", columns="col", aggfunc="mean", fill_value=0)

 Also note that we can pass in other aggregation functions as well. For example,
 we can also pass in ``sum``.

 .. ipython:: python

-    df.pivot_table(
-        values='val0', index='row', columns='col', aggfunc='sum', fill_value=0)
+    df.pivot_table(values="val0", index="row", columns="col", aggfunc="sum", fill_value=0)

 Another aggregation we can do is calculate the frequency in which the columns
 and rows occur together a.k.a. "cross tabulation". To do this, we can pass
@@ -787,7 +815,7 @@ and rows occur together a.k.a. "cross tabulation". To do this, we can pass

 .. ipython:: python

-    df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size')
+    df.pivot_table(index="row", columns="col", fill_value=0, aggfunc="size")

 Pivoting with multiple aggregations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -797,24 +825,21 @@ We can also perform multiple aggregations. For example, to perform both a

 .. ipython:: python

-    df.pivot_table(
-        values='val0', index='row', columns='col', aggfunc=['mean', 'sum'])
+    df.pivot_table(values="val0", index="row", columns="col", aggfunc=["mean", "sum"])

 Note to aggregate over multiple value columns, we can pass in a list to the
 ``values`` parameter.

 .. ipython:: python

-    df.pivot_table(
-        values=['val0', 'val1'], index='row', columns='col', aggfunc=['mean'])
+    df.pivot_table(values=["val0", "val1"], index="row", columns="col", aggfunc=["mean"])

 Note to subdivide over multiple columns we can pass in a list to the
 ``columns`` parameter.

 .. ipython:: python

-    df.pivot_table(
-        values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
+    df.pivot_table(values=["val0"], index="row", columns=["item", "col"], aggfunc=["mean"])

 .. _reshaping.explode:
@@ -827,28 +852,28 @@ Sometimes the values in a column are list-like.

 .. ipython:: python

-    keys = ['panda1', 'panda2', 'panda3']
-    values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
-    df = pd.DataFrame({'keys': keys, 'values': values})
+    keys = ["panda1", "panda2", "panda3"]
+    values = [["eats", "shoots"], ["shoots", "leaves"], ["eats", "leaves"]]
+    df = pd.DataFrame({"keys": keys, "values": values})
     df

 We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. This will replicate the index values from the original row:

 .. ipython:: python

-    df['values'].explode()
+    df["values"].explode()

 You can also explode the column in the ``DataFrame``.

 .. ipython:: python

-    df.explode('values')
+    df.explode("values")

 :meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``.

 .. ipython:: python

-    s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']])
+    s = pd.Series([[1, 2, 3], "foo", [], ["a", "b"]])
     s
     s.explode()
@@ -856,12 +881,11 @@ Here is a typical usecase. You have comma separated strings in a column and want

 .. ipython:: python

-    df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
-                       {'var1': 'd,e,f', 'var2': 2}])
+    df = pd.DataFrame([{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}])
     df

 Creating a long form DataFrame is now straightforward using explode and chained operations
 .. ipython:: python

-    df.assign(var1=df.var1.str.split(',')).explode('var1')
+    df.assign(var1=df.var1.str.split(",")).explode("var1")
diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst
index 3979ad1f3e949..971a415088220 100644
--- a/doc/source/user_guide/timedeltas.rst
+++ b/doc/source/user_guide/timedeltas.rst
@@ -25,33 +25,33 @@ You can construct a ``Timedelta`` scalar through various arguments, including `I
     import datetime

     # strings
-    pd.Timedelta('1 days')
-    pd.Timedelta('1 days 00:00:00')
-    pd.Timedelta('1 days 2 hours')
-    pd.Timedelta('-1 days 2 min 3us')
+    pd.Timedelta("1 days")
+    pd.Timedelta("1 days 00:00:00")
+    pd.Timedelta("1 days 2 hours")
+    pd.Timedelta("-1 days 2 min 3us")

     # like datetime.timedelta
     # note: these MUST be specified as keyword arguments
     pd.Timedelta(days=1, seconds=1)

     # integers with a unit
-    pd.Timedelta(1, unit='d')
+    pd.Timedelta(1, unit="d")

     # from a datetime.timedelta/np.timedelta64
     pd.Timedelta(datetime.timedelta(days=1, seconds=1))
-    pd.Timedelta(np.timedelta64(1, 'ms'))
+    pd.Timedelta(np.timedelta64(1, "ms"))

     # negative Timedeltas have this string repr
     # to be more consistent with datetime.timedelta conventions
-    pd.Timedelta('-1us')
+    pd.Timedelta("-1us")

     # a NaT
-    pd.Timedelta('nan')
-    pd.Timedelta('nat')
+    pd.Timedelta("nan")
+    pd.Timedelta("nat")

     # ISO 8601 Duration strings
-    pd.Timedelta('P0DT0H1M0S')
-    pd.Timedelta('P0DT0H0M0.000000123S')
+    pd.Timedelta("P0DT0H1M0S")
+    pd.Timedelta("P0DT0H0M0.000000123S")

 :ref:`DateOffsets<timeseries.offsets>` (``Day, Hour, Minute, Second, Milli, Micro, Nano``) can also be used in construction.
@@ -63,8 +63,9 @@ Further, operations among the scalars yield another scalar ``Timedelta``.

 .. ipython:: python

-    pd.Timedelta(pd.offsets.Day(2)) + pd.Timedelta(pd.offsets.Second(2)) +\
-        pd.Timedelta('00:00:00.000123')
+    pd.Timedelta(pd.offsets.Day(2)) + pd.Timedelta(pd.offsets.Second(2)) + pd.Timedelta(
+        "00:00:00.000123"
+    )

 to_timedelta
 ~~~~~~~~~~~~
@@ -78,21 +79,21 @@ You can parse a single string to a Timedelta:

 .. ipython:: python

-    pd.to_timedelta('1 days 06:05:01.00003')
-    pd.to_timedelta('15.5us')
+    pd.to_timedelta("1 days 06:05:01.00003")
+    pd.to_timedelta("15.5us")

 or a list/array of strings:

 .. ipython:: python

-    pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan'])
+    pd.to_timedelta(["1 days 06:05:01.00003", "15.5us", "nan"])

 The ``unit`` keyword argument specifies the unit of the Timedelta:

 .. ipython:: python

-    pd.to_timedelta(np.arange(5), unit='s')
-    pd.to_timedelta(np.arange(5), unit='d')
+    pd.to_timedelta(np.arange(5), unit="s")
+    pd.to_timedelta(np.arange(5), unit="d")

 .. _timedeltas.limitations:
@@ -118,11 +119,11 @@ subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``.

 .. ipython:: python

-    s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D'))
+    s = pd.Series(pd.date_range("2012-1-1", periods=3, freq="D"))
     td = pd.Series([pd.Timedelta(days=i) for i in range(3)])
-    df = pd.DataFrame({'A': s, 'B': td})
+    df = pd.DataFrame({"A": s, "B": td})
     df
-    df['C'] = df['A'] + df['B']
+    df["C"] = df["A"] + df["B"]
     df
     df.dtypes
@@ -165,10 +166,10 @@ Operands can also appear in a reversed order (a singular object operated with a
 .. ipython:: python

-    A = s - pd.Timestamp('20120101') - pd.Timedelta('00:05:05')
-    B = s - pd.Series(pd.date_range('2012-1-2', periods=3, freq='D'))
+    A = s - pd.Timestamp("20120101") - pd.Timedelta("00:05:05")
+    B = s - pd.Series(pd.date_range("2012-1-2", periods=3, freq="D"))

-    df = pd.DataFrame({'A': A, 'B': B})
+    df = pd.DataFrame({"A": A, "B": B})
     df

     df.min()
@@ -192,17 +193,17 @@ You can fillna on timedeltas, passing a timedelta to get a particular value.

 .. ipython:: python

     y.fillna(pd.Timedelta(0))
-    y.fillna(pd.Timedelta(10, unit='s'))
-    y.fillna(pd.Timedelta('-1 days, 00:00:05'))
+    y.fillna(pd.Timedelta(10, unit="s"))
+    y.fillna(pd.Timedelta("-1 days, 00:00:05"))

 You can also negate, multiply and use ``abs`` on ``Timedeltas``:

 .. ipython:: python

-    td1 = pd.Timedelta('-1 days 2 hours 3 seconds')
+    td1 = pd.Timedelta("-1 days 2 hours 3 seconds")
     td1
     -1 * td1
-    - td1
+    -td1
     abs(td1)

 .. _timedeltas.timedeltas_reductions:
@@ -215,12 +216,13 @@ Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` ob

 .. ipython:: python

-    y2 = pd.Series(pd.to_timedelta(['-1 days +00:00:05', 'nat',
-                                    '-1 days +00:00:05', '1 days']))
+    y2 = pd.Series(
+        pd.to_timedelta(["-1 days +00:00:05", "nat", "-1 days +00:00:05", "1 days"])
+    )
     y2
     y2.mean()
     y2.median()
-    y2.quantile(.1)
+    y2.quantile(0.1)
     y2.sum()

 .. _timedeltas.timedeltas_convert:
@@ -234,8 +236,8 @@ Note that division by the NumPy scalar is true division, while astyping is equiv

 .. ipython:: python

-    december = pd.Series(pd.date_range('20121201', periods=4))
-    january = pd.Series(pd.date_range('20130101', periods=4))
+    december = pd.Series(pd.date_range("20121201", periods=4))
+    january = pd.Series(pd.date_range("20130101", periods=4))
     td = january - december

     td[2] += datetime.timedelta(minutes=5, seconds=3)
     td

     # to days
-    td / np.timedelta64(1, 'D')
-    td.astype('timedelta64[D]')
+    td / np.timedelta64(1, "D")
+    td.astype("timedelta64[D]")

     # to seconds
-    td / np.timedelta64(1, 's')
-    td.astype('timedelta64[s]')
+    td / np.timedelta64(1, "s")
+    td.astype("timedelta64[s]")

     # to months (these are constant months)
-    td / np.timedelta64(1, 'M')
+    td / np.timedelta64(1, "M")

 Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series
 yields another ``timedelta64[ns]`` dtypes Series.
@@ -305,7 +307,7 @@ You can access the value of the fields for a scalar ``Timedelta`` directly.

 .. ipython:: python

-    tds = pd.Timedelta('31 days 5 min 3 sec')
+    tds = pd.Timedelta("31 days 5 min 3 sec")
     tds.days
     tds.seconds
     (-tds).seconds
@@ -325,9 +327,9 @@ You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the

 .. ipython:: python

-    pd.Timedelta(days=6, minutes=50, seconds=3,
-                 milliseconds=10, microseconds=10,
-                 nanoseconds=12).isoformat()
+    pd.Timedelta(
+        days=6, minutes=50, seconds=3, milliseconds=10, microseconds=10, nanoseconds=12
+    ).isoformat()

 .. _ISO 8601 Duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
@@ -344,15 +346,21 @@ or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent miss

 .. ipython:: python

-    pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64(2, 'D'),
-                       datetime.timedelta(days=2, seconds=2)])
+    pd.TimedeltaIndex(
+        [
+            "1 days",
+            "1 days, 00:00:05",
+            np.timedelta64(2, "D"),
+            datetime.timedelta(days=2, seconds=2),
+        ]
+    )

 The string 'infer' can be passed in order to set the frequency of the index as the
 inferred frequency upon creation:
 .. ipython:: python

-    pd.TimedeltaIndex(['0 days', '10 days', '20 days'], freq='infer')
+    pd.TimedeltaIndex(["0 days", "10 days", "20 days"], freq="infer")

 Generating ranges of time deltas
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -363,24 +371,24 @@ calendar day:

 .. ipython:: python

-    pd.timedelta_range(start='1 days', periods=5)
+    pd.timedelta_range(start="1 days", periods=5)

 Various combinations of ``start``, ``end``, and ``periods`` can be used with
 ``timedelta_range``:

 .. ipython:: python

-    pd.timedelta_range(start='1 days', end='5 days')
+    pd.timedelta_range(start="1 days", end="5 days")

-    pd.timedelta_range(end='10 days', periods=4)
+    pd.timedelta_range(end="10 days", periods=4)

 The ``freq`` parameter can passed a variety of :ref:`frequency aliases `:

 .. ipython:: python

-    pd.timedelta_range(start='1 days', end='2 days', freq='30T')
+    pd.timedelta_range(start="1 days", end="2 days", freq="30T")

-    pd.timedelta_range(start='1 days', periods=5, freq='2D5H')
+    pd.timedelta_range(start="1 days", periods=5, freq="2D5H")

 Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
@@ -389,9 +397,9 @@ in the resulting ``TimedeltaIndex``:

 .. ipython:: python

-    pd.timedelta_range('0 days', '4 days', periods=5)
+    pd.timedelta_range("0 days", "4 days", periods=5)

-    pd.timedelta_range('0 days', '4 days', periods=10)
+    pd.timedelta_range("0 days", "4 days", periods=10)

 Using the TimedeltaIndex
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -401,23 +409,22 @@ Similarly to other of the datetime-like indices, ``DatetimeIndex`` and ``PeriodI

 .. ipython:: python

-    s = pd.Series(np.arange(100),
-                  index=pd.timedelta_range('1 days', periods=100, freq='h'))
+    s = pd.Series(np.arange(100), index=pd.timedelta_range("1 days", periods=100, freq="h"))
     s

 Selections work similarly, with coercion on string-likes and slices:

 .. ipython:: python

-    s['1 day':'2 day']
-    s['1 day 01:00:00']
-    s[pd.Timedelta('1 day 1h')]
+    s["1 day":"2 day"]
+    s["1 day 01:00:00"]
+    s[pd.Timedelta("1 day 1h")]

 Furthermore you can use partial string selection and the range will be inferred:

 .. ipython:: python

-    s['1 day':'1 day 5 hours']
+    s["1 day":"1 day 5 hours"]

 Operations
 ~~~~~~~~~~
@@ -426,9 +433,9 @@ Finally, the combination of ``TimedeltaIndex`` with ``DatetimeIndex`` allow cert

 .. ipython:: python

-    tdi = pd.TimedeltaIndex(['1 days', pd.NaT, '2 days'])
+    tdi = pd.TimedeltaIndex(["1 days", pd.NaT, "2 days"])
     tdi.to_list()
-    dti = pd.date_range('20130101', periods=3)
+    dti = pd.date_range("20130101", periods=3)
     dti.to_list()
     (dti + tdi).to_list()
     (dti - tdi).to_list()
@@ -440,22 +447,22 @@ Similarly to frequency conversion on a ``Series`` above, you can convert these i

 .. ipython:: python

-    tdi / np.timedelta64(1, 's')
-    tdi.astype('timedelta64[s]')
+    tdi / np.timedelta64(1, "s")
+    tdi.astype("timedelta64[s]")

 Scalars type ops work as well. These can potentially return a *different* type of index.

 .. ipython:: python

     # adding or timedelta and date -> datelike
-    tdi + pd.Timestamp('20130101')
+    tdi + pd.Timestamp("20130101")

     # subtraction of a date and a timedelta -> datelike
     # note that trying to subtract a date from a Timedelta will raise an exception
-    (pd.Timestamp('20130101') - tdi).to_list()
+    (pd.Timestamp("20130101") - tdi).to_list()

     # timedelta + timedelta -> timedelta
-    tdi + pd.Timedelta('10 days')
+    tdi + pd.Timedelta("10 days")

     # division can result in a Timedelta if the divisor is an integer
     tdi / 2
@@ -472,4 +479,4 @@ Similar to :ref:`timeseries resampling `, we can resample
 .. ipython:: python

-    s.resample('D').mean()
+    s.resample("D").mean()
diff --git a/setup.cfg b/setup.cfg
index d938d2ef3972a..656bd82a2b65e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,6 +33,7 @@ exclude =
   env  # exclude asv benchmark environments from linting

 [flake8-rst]
+max-line-length = 88
 bootstrap =
     import numpy as np
     import pandas as pd