diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index bc31d362118b5..49200523df40f 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -2,7 +2,7 @@ Whether you are a novice or experienced software developer, all contributions and suggestions are welcome! -Our main contributing guide can be found [in this repo](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst) or [on the website](https://pandas-docs.github.io/pandas-docs-travis/development/contributing.html). If you do not want to read it in its entirety, we will summarize the main ways in which you can contribute and point to relevant sections of that document for further information. +Our main contributing guide can be found [in this repo](https://github.com/pandas-dev/pandas/blob/master/doc/source/development/contributing.rst) or [on the website](https://pandas.pydata.org/docs/dev/development/contributing.html). If you do not want to read it in its entirety, we will summarize the main ways in which you can contribute and point to relevant sections of that document for further information. ## Getting Started diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index e33835c462511..0000000000000 --- a/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,29 +0,0 @@ -#### Code Sample, a copy-pastable example if possible - -```python -# Your code here - -``` -#### Problem description - -[this should explain **why** the current behaviour is a problem and why the expected output is a better solution.] - -**Note**: We receive a lot of issues on our GitHub tracker, so it is very possible that your issue has been posted before. Please check first before submitting so that we do not have to handle and close duplicates! - -**Note**: Many problems can be resolved by simply upgrading `pandas` to the latest version. Before submitting, please check if that solution works for you. If possible, you may want to check if `master` addresses this issue, but that is not necessary. - -For documentation-related issues, you can check the latest versions of the docs on `master` here: - -https://pandas-docs.github.io/pandas-docs-travis/ - -If the issue has not been resolved there, go ahead and file it in the issue tracker. - -#### Expected Output - -#### Output of ``pd.show_versions()`` - -
- -[paste the output of ``pd.show_versions()`` here below this line] - -</details>
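Both the retired template above and the new bug report template that follows ask reporters to confirm the problem on the latest pandas release (and, optionally, on ``master``) before filing. A minimal sketch of that check; the upgrade command in the comment is the standard pip invocation, an assumption rather than something the templates prescribe:

```python
import pandas as pd

# Release the bug was observed on; compare against the latest release on PyPI.
print(pd.__version__)

# After upgrading (for example `pip install --upgrade pandas`) or installing a
# master/nightly build, re-run the failing snippet to see whether it is already fixed.
```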
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000000..765c1b8bff62e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,39 @@ +--- + +name: Bug Report +about: Create a bug report to help us improve pandas +title: "BUG:" +labels: "Bug, Needs Triage" + +--- + +- [ ] I have checked that this issue has not already been reported. + +- [ ] I have confirmed this bug exists on the latest version of pandas. + +- [ ] (optional) I have confirmed this bug exists on the master branch of pandas. + +--- + +**Note**: Please read [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your bug. + +#### Code Sample, a copy-pastable example + +```python +# Your code here + +``` + +#### Problem description + +[this should explain **why** the current behaviour is a problem and why the expected output is a better solution] + +#### Expected Output + +#### Output of ``pd.show_versions()`` + +
+ +[paste the output of ``pd.show_versions()`` here leaving a blank line after the details tag] + +</details>
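The bug report template above asks for a copy-pastable example, a problem description with the expected output, and the ``pd.show_versions()`` report inside the details block. A minimal sketch of such a report snippet; the frame and the operation are illustrative placeholders, not something the template prescribes:

```python
import pandas as pd

# Copy-pastable reproduction (placeholder data, not from the template).
df = pd.DataFrame({"a": [1, 2, None]}, dtype="Int64")
result = df["a"].sum()
print(result)  # observed output goes in "Problem description", expected in "Expected Output"

# Environment report to paste inside the details block of the issue.
pd.show_versions()
```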
diff --git a/.github/ISSUE_TEMPLATE/documentation_improvement.md b/.github/ISSUE_TEMPLATE/documentation_improvement.md new file mode 100644 index 0000000000000..32d5612767a8c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation_improvement.md @@ -0,0 +1,22 @@ +--- + +name: Documentation Improvement +about: Report wrong or missing documentation +title: "DOC:" +labels: "Docs, Needs Triage" + +--- + +#### Location of the documentation + +[this should provide the location of the documentation, e.g. "pandas.read_csv" or the URL of the documentation, e.g. "https://dev.pandas.io/docs/reference/api/pandas.read_csv.html"] + +**Note**: You can check the latest versions of the docs on `master` [here](https://pandas.pydata.org/docs/dev/). + +#### Documentation problem + +[this should provide a description of what documentation you believe needs to be fixed/improved] + +#### Suggested fix for documentation + +[this should explain the suggested fix and **why** it's better than the existing documentation] diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000000..0c30b941bc520 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,33 @@ +--- + +name: Feature Request +about: Suggest an idea for pandas +title: "ENH:" +labels: "Enhancement, Needs Triage" + +--- + +#### Is your feature request related to a problem? + +[this should provide a description of what the problem is, e.g. "I wish I could use pandas to do [...]"] + +#### Describe the solution you'd like + +[this should provide a description of the feature request, e.g. "`DataFrame.foo` should get a new parameter `bar` that [...]", try to write a docstring for the desired feature] + +#### API breaking implications + +[this should provide a description of how this feature will affect the API] + +#### Describe alternatives you've considered + +[this should provide a description of any alternative solutions or features you've considered] + +#### Additional context + +[add any other context, code examples, or references to existing implementations about the feature request here] + +```python +# Your code here, if applicable + +``` diff --git a/.github/ISSUE_TEMPLATE/submit_question.md b/.github/ISSUE_TEMPLATE/submit_question.md new file mode 100644 index 0000000000000..9b48918ff2f6d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/submit_question.md @@ -0,0 +1,24 @@ +--- + +name: Submit Question +about: Ask a general question about pandas +title: "QST:" +labels: "Usage Question, Needs Triage" + +--- + +- [ ] I have searched the [[pandas] tag](https://stackoverflow.com/questions/tagged/pandas) on StackOverflow for similar questions. + +- [ ] I have asked my usage related question on [StackOverflow](https://stackoverflow.com). + +--- + +#### Question about pandas + +**Note**: If you'd still like to submit a question, please read [this guide]( +https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your question. + +```python +# Your code here, if applicable + +``` diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d87fa5203bd52..db1fc30111a2d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -125,35 +125,18 @@ jobs: - name: Check ipython directive errors run: "! 
grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" - - name: Merge website and docs + - name: Install ssh key run: | - mkdir -p pandas_web/docs - cp -r web/build/* pandas_web/ - cp -r doc/build/html/* pandas_web/docs/ + mkdir -m 700 -p ~/.ssh + echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa + chmod 600 ~/.ssh/id_rsa + echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts if: github.event_name == 'push' - - name: Install Rclone - run: sudo apt install rclone -y + - name: Upload web + run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='Pandas_Cheat_Sheet*' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas if: github.event_name == 'push' - - name: Set up Rclone - run: | - RCLONE_CONFIG_PATH=$HOME/.config/rclone/rclone.conf - mkdir -p `dirname $RCLONE_CONFIG_PATH` - echo "[ovh_cloud_pandas_web]" > $RCLONE_CONFIG_PATH - echo "type = swift" >> $RCLONE_CONFIG_PATH - echo "env_auth = false" >> $RCLONE_CONFIG_PATH - echo "auth_version = 3" >> $RCLONE_CONFIG_PATH - echo "auth = https://auth.cloud.ovh.net/v3/" >> $RCLONE_CONFIG_PATH - echo "endpoint_type = public" >> $RCLONE_CONFIG_PATH - echo "tenant_domain = default" >> $RCLONE_CONFIG_PATH - echo "tenant = 2977553886518025" >> $RCLONE_CONFIG_PATH - echo "domain = default" >> $RCLONE_CONFIG_PATH - echo "user = w4KGs3pmDxpd" >> $RCLONE_CONFIG_PATH - echo "key = ${{ secrets.ovh_object_store_key }}" >> $RCLONE_CONFIG_PATH - echo "region = BHS" >> $RCLONE_CONFIG_PATH - if: github.event_name == 'push' - - - name: Sync web - run: rclone sync pandas_web ovh_cloud_pandas_web:dev + - name: Upload dev docs + run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev if: github.event_name == 'push' diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE index 2f444cb44d505..ce1b07b783e74 100644 --- a/LICENSES/HAVEN_LICENSE +++ b/LICENSES/HAVEN_LICENSE @@ -1,2 +1,21 @@ -YEAR: 2013-2016 -COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller +# MIT License + +Copyright (c) 2019 Hadley Wickham; RStudio; and Evan Miller + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index 5342eda4390eb..33dfbf10ff743 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ # pandas: powerful Python data analysis toolkit [![PyPI Latest Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/) [![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) [![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/) [![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/master/LICENSE) [![Travis Build Status](https://travis-ci.org/pandas-dev/pandas.svg?branch=master)](https://travis-ci.org/pandas-dev/pandas) @@ -158,7 +159,7 @@ Most development discussion is taking place on github in this repo. Further, the All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome. -A detailed overview on how to contribute can be found in the **[contributing guide](https://dev.pandas.io/docs/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub. +A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub. If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out. diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 7886b63e9983e..7c10a2d17775a 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -39,7 +39,7 @@ // followed by the pip installed packages). 
"matrix": { "numpy": [], - "Cython": [], + "Cython": ["0.29.16"], "matplotlib": [], "sqlalchemy": [], "scipy": [], diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index d1e94f62967f4..2745db58e83e3 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -50,6 +50,53 @@ def time_frame_op_with_scalar(self, dtype, scalar, op): op(self.df, scalar) +class OpWithFillValue: + def setup(self): + # GH#31300 + arr = np.arange(10 ** 6) + df = DataFrame({"A": arr}) + ser = df["A"] + + self.df = df + self.ser = ser + + def time_frame_op_with_fill_value_no_nas(self): + self.df.add(self.df, fill_value=4) + + def time_series_op_with_fill_value_no_nas(self): + self.ser.add(self.ser, fill_value=4) + + +class MixedFrameWithSeriesAxis0: + params = [ + [ + "eq", + "ne", + "lt", + "le", + "ge", + "gt", + "add", + "sub", + "div", + "floordiv", + "mul", + "pow", + ] + ] + param_names = ["opname"] + + def setup(self, opname): + arr = np.arange(10 ** 6).reshape(100, -1) + df = DataFrame(arr) + df["C"] = 1.0 + self.df = df + self.ser = df[0] + + def time_frame_op_with_series_axis0(self, opname): + getattr(self.df, opname)(self.ser, axis=0) + + class Ops: params = [[True, False], ["default", 1]] diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 8cbf8c8592661..103df0fd94847 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -9,6 +9,11 @@ def setup(self): self.values_float = np.array([1.0, 0.0, 1.0, 0.0]) self.values_integer = np.array([1, 0, 1, 0]) self.values_integer_like = [1, 0, 1, 0] + self.data = np.array([True, False, True, False]) + self.mask = np.array([False, False, True, False]) + + def time_constructor(self): + pd.arrays.BooleanArray(self.data, self.mask) def time_from_bool_array(self): pd.array(self.values_bool, dtype="boolean") @@ -21,3 +26,16 @@ def time_from_integer_like(self): def time_from_float_array(self): pd.array(self.values_float, dtype="boolean") + + +class IntegerArray: + def setup(self): + self.values_integer = np.array([1, 0, 1, 0]) + self.data = np.array([1, 2, 3, 4], dtype="int64") + self.mask = np.array([False, False, True, False]) + + def time_constructor(self): + pd.arrays.IntegerArray(self.data, self.mask) + + def time_from_integer_array(self): + pd.array(self.values_integer, dtype="Int64") diff --git a/asv_bench/benchmarks/finalize.py b/asv_bench/benchmarks/finalize.py new file mode 100644 index 0000000000000..dc06f55cc6ca0 --- /dev/null +++ b/asv_bench/benchmarks/finalize.py @@ -0,0 +1,16 @@ +import pandas as pd + + +class Finalize: + param_names = ["series", "frame"] + params = [pd.Series, pd.DataFrame] + + def setup(self, param): + N = 1000 + obj = param(dtype=float) + for i in range(N): + obj.attrs[i] = i + self.obj = obj + + def time_finalize_micro(self, param): + self.obj.__finalize__(self.obj, method="__finalize__") diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 2b24bab85bc57..dc6f45f810f3d 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,5 +1,6 @@ import numpy as np +import pandas as pd from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range from .pandas_vb_common import tm @@ -118,4 +119,48 @@ def time_frame_from_range(self): self.df = DataFrame(self.data) +class FromArrays: + + goal_time = 0.2 + + def setup(self): + N_rows = 1000 + N_cols = 1000 + self.float_arrays = [np.random.randn(N_rows) for _ in range(N_cols)] + self.sparse_arrays = [ 
+ pd.arrays.SparseArray(np.random.randint(0, 2, N_rows), dtype="float64") + for _ in range(N_cols) + ] + self.int_arrays = [ + pd.array(np.random.randint(1000, size=N_rows), dtype="Int64") + for _ in range(N_cols) + ] + self.index = pd.Index(range(N_rows)) + self.columns = pd.Index(range(N_cols)) + + def time_frame_from_arrays_float(self): + self.df = DataFrame._from_arrays( + self.float_arrays, + index=self.index, + columns=self.columns, + verify_integrity=False, + ) + + def time_frame_from_arrays_int(self): + self.df = DataFrame._from_arrays( + self.int_arrays, + index=self.index, + columns=self.columns, + verify_integrity=False, + ) + + def time_frame_from_arrays_sparse(self): + self.df = DataFrame._from_arrays( + self.sparse_arrays, + index=self.index, + columns=self.columns, + verify_integrity=False, + ) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 2187668c96ca4..a3aff45afa116 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -619,4 +619,17 @@ def time_select_dtypes(self, n): self.df.select_dtypes(include="int") +class MemoryUsage: + def setup(self): + self.df = DataFrame(np.random.randn(100000, 2), columns=list("AB")) + self.df2 = self.df.copy() + self.df2["A"] = self.df2["A"].astype("object") + + def time_memory_usage(self): + self.df.memory_usage(deep=True) + + def time_memory_usage_object_dtype(self): + self.df2.memory_usage(deep=True) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index e98d2948e76ea..7f3c24d5a2732 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -308,6 +308,31 @@ def time_frame_getitem_single_column_int(self): self.df_int_col[0] +class IndexSingleRow: + params = [True, False] + param_names = ["unique_cols"] + + def setup(self, unique_cols): + arr = np.arange(10 ** 7).reshape(-1, 10) + df = DataFrame(arr) + dtypes = ["u1", "u2", "u4", "u8", "i1", "i2", "i4", "i8", "f8", "f4"] + for i, d in enumerate(dtypes): + df[i] = df[i].astype(d) + + if not unique_cols: + # GH#33032 single-row lookups with non-unique columns were + # 15x slower than with unique columns + df.columns = ["A", "A"] + list(df.columns[2:]) + + self.df = df + + def time_iloc_row(self, unique_cols): + self.df.iloc[10000] + + def time_loc_row(self, unique_cols): + self.df.loc[10000] + + class AssignTimeseriesIndex: def setup(self): N = 100000 diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 793f0c7c03c77..18dbb7eae0615 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -74,10 +74,38 @@ def setup(self): ], dtype=object, ) + self.other_mi_many_mismatches = MultiIndex.from_tuples( + [ + (-7, 41), + (-2, 3), + (-0.7, 5), + (0, 0), + (0, 1.5), + (0, 340), + (0, 1001), + (1, -4), + (1, 20), + (1, 1040), + (432, -5), + (432, 17), + (439, 165.5), + (998, -4), + (998, 24065), + (999, 865.2), + (999, 1000), + (1045, -843), + ] + ) def time_get_indexer(self): self.mi_int.get_indexer(self.obj_index) + def time_get_indexer_and_backfill(self): + self.mi_int.get_indexer(self.other_mi_many_mismatches, method="backfill") + + def time_get_indexer_and_pad(self): + self.mi_int.get_indexer(self.other_mi_many_mismatches, method="pad") + def time_is_monotonic(self): self.mi_int.is_monotonic diff --git 
a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index f7e1e395a76bc..f85dc83ab8605 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -11,7 +11,7 @@ class Methods: ["int", "float"], ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], ) - param_names = ["contructor", "window", "dtype", "method"] + param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): N = 10 ** 5 @@ -72,7 +72,7 @@ class ExpandingMethods: ["int", "float"], ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], ) - param_names = ["contructor", "window", "dtype", "method"] + param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, dtype, method): N = 10 ** 5 @@ -86,7 +86,7 @@ def time_expanding(self, constructor, dtype, method): class EWMMethods: params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"]) - param_names = ["contructor", "window", "dtype", "method"] + param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): N = 10 ** 5 @@ -104,7 +104,7 @@ class VariableWindowMethods(Methods): ["int", "float"], ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], ) - param_names = ["contructor", "window", "dtype", "method"] + param_names = ["constructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): N = 10 ** 5 @@ -165,4 +165,26 @@ def peakmem_fixed(self): self.roll.max() +class ForwardWindowMethods: + params = ( + ["DataFrame", "Series"], + [10, 1000], + ["int", "float"], + ["median", "mean", "max", "min", "kurt", "sum"], + ) + param_names = ["constructor", "window_size", "dtype", "method"] + + def setup(self, constructor, window_size, dtype, method): + N = 10 ** 5 + arr = np.random.random(N).astype(dtype) + indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=window_size) + self.roll = getattr(pd, constructor)(arr).rolling(window=indexer) + + def time_rolling(self, constructor, window_size, dtype, method): + getattr(self.roll, method)() + + def peakmem_rolling(self, constructor, window_size, dtype, method): + getattr(self.roll, method)() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 57c625ced8a43..d78419c12ce0d 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -223,27 +223,27 @@ def time_series_datetimeindex_repr(self): class All: - params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] - param_names = ["N", "case"] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + param_names = ["N", "case", "dtype"] - def setup(self, N, case): + def setup(self, N, case, dtype): val = case != "fast" - self.s = Series([val] * N) + self.s = Series([val] * N, dtype=dtype) - def time_all(self, N, case): + def time_all(self, N, case, dtype): self.s.all() class Any: - params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] - param_names = ["N", "case"] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + param_names = ["N", "case", "dtype"] - def setup(self, N, case): + def setup(self, N, case, dtype): val = case == "fast" - self.s = Series([val] * N) + self.s = Series([val] * N, dtype=dtype) - def time_any(self, N, case): + def time_any(self, N, case, dtype): self.s.any() @@ -265,11 +265,14 @@ class NanOps: "prod", ], [10 ** 3, 10 ** 
6], - ["int8", "int32", "int64", "float64"], + ["int8", "int32", "int64", "float64", "Int64", "boolean"], ] param_names = ["func", "N", "dtype"] def setup(self, func, N, dtype): + if func == "argmax" and dtype in {"Int64", "boolean"}: + # Skip argmax for nullable int since this doesn't work yet (GH-24382) + raise NotImplementedError self.s = Series([1] * N, dtype=dtype) self.func = getattr(self.s, func) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index ac78ca53679fd..d6aa41a7e0f32 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -2,7 +2,8 @@ import scipy.sparse import pandas as pd -from pandas import MultiIndex, Series, SparseArray, date_range +from pandas import MultiIndex, Series, date_range +from pandas.arrays import SparseArray def make_array(size, dense_proportion, fill_value, dtype): @@ -45,7 +46,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype): class SparseDataFrameConstructor: def setup(self): N = 1000 - self.arr = np.arange(N) self.sparse = scipy.sparse.rand(N, N, 0.005) def time_from_scipy(self): diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index ec67394e55a1e..ebbd3c9eddfdb 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -7,11 +7,17 @@ class FrameOps: - params = [ops, ["float", "int"], [0, 1]] + params = [ops, ["float", "int", "Int64"], [0, 1]] param_names = ["op", "dtype", "axis"] def setup(self, op, dtype, axis): - df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) + if op == "mad" and dtype == "Int64" and axis == 1: + # GH-33036 + raise NotImplementedError + values = np.random.randn(100000, 4) + if dtype == "Int64": + values = values.astype(int) + df = pd.DataFrame(values).astype(dtype) self.df_func = getattr(df, op) def time_op(self, op, dtype, axis): diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 6c9f8ee77e5ad..b494dbd8a38fa 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -336,15 +336,33 @@ def time_infer_quarter(self): class ToDatetimeFormat: def setup(self): - self.s = Series(["19MAY11", "19MAY11:00:00:00"] * 100000) + N = 100000 + self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N) self.s2 = self.s.str.replace(":\\S+$", "") + self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N + self.diff_offset = [ + f"10/11/2018 00:00:00.045-0{offset}:00" for offset in range(10) + ] * int(N / 10) + def time_exact(self): to_datetime(self.s2, format="%d%b%y") def time_no_exact(self): to_datetime(self.s, format="%d%b%y", exact=False) + def time_same_offset(self): + to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") + + def time_different_offset(self): + to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z") + + def time_same_offset_to_utc(self): + to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) + + def time_different_offset_to_utc(self): + to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) + class ToDatetimeCache: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d992c64073476..d042bda77d4e8 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,4 +1,10 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml +trigger: +- master + +pr: +- master + jobs: # Mac and Linux use the same template - template: ci/azure/posix.yml @@ -15,78 +21,3 @@ jobs: parameters: name: Windows vmImage: vs2017-win2016 - 
-- job: 'Web_and_Docs' - pool: - vmImage: ubuntu-16.04 - timeoutInMinutes: 90 - steps: - - script: | - echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' - echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - displayName: 'Setting environment variables' - - - script: | - sudo apt-get install -y libc6-dev-i386 - ci/setup_env.sh - displayName: 'Setup environment and build pandas' - - - script: | - source activate pandas-dev - python web/pandas_web.py web/pandas --target-path=web/build - displayName: 'Build website' - - - script: | - source activate pandas-dev - # Next we should simply have `doc/make.py --warnings-are-errors`, everything else is required because the ipython directive doesn't fail the build on errors (https://github.com/ipython/ipython/issues/11547) - doc/make.py --warnings-are-errors | tee sphinx.log ; SPHINX_RET=${PIPESTATUS[0]} - grep -B1 "^<<<-------------------------------------------------------------------------$" sphinx.log ; IPY_RET=$(( $? != 1 )) - exit $(( $SPHINX_RET + $IPY_RET )) - displayName: 'Build documentation' - - - script: | - mkdir -p to_deploy/docs - cp -r web/build/* to_deploy/ - cp -r doc/build/html/* to_deploy/docs/ - displayName: 'Merge website and docs' - - - script: | - cd to_deploy - git init - touch .nojekyll - echo "dev.pandas.io" > CNAME - printf "User-agent: *\nDisallow: /" > robots.txt - git add --all . - git config user.email "pandas-dev@python.org" - git config user.name "pandas-bot" - git commit -m "pandas web and documentation in master" - displayName: 'Create git repo for docs build' - condition : | - and(not(eq(variables['Build.Reason'], 'PullRequest')), - eq(variables['Build.SourceBranch'], 'refs/heads/master')) - - # For `InstallSSHKey@0` to work, next steps are required: - # 1. Generate a pair of private/public keys (i.e. `ssh-keygen -t rsa -b 4096 -C "your_email@example.com"`) - # 2. Go to "Library > Secure files" in the Azure Pipelines dashboard: https://dev.azure.com/pandas-dev/pandas/_library?itemType=SecureFiles - # 3. Click on "+ Secure file" - # 4. Upload the private key (the name of the file must match with the specified in "sshKeySecureFile" input below, "pandas_docs_key") - # 5. Click on file name after it is created, tick the box "Authorize for use in all pipelines" and save - # 6. 
The public key specified in "sshPublicKey" is the pair of the uploaded private key, and needs to be set as a deploy key of the repo where the docs will be pushed (with write access): https://github.com/pandas-dev/pandas-dev.github.io/settings/keys - - task: InstallSSHKey@0 - inputs: - hostName: 'github.com,192.30.252.128 ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==' - sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDHmz3l/EdqrgNxEUKkwDUuUcLv91unig03pYFGO/DMIgCmPdMG96zAgfnESd837Rm0wSSqylwSzkRJt5MV/TpFlcVifDLDQmUhqCeO8Z6dLl/oe35UKmyYICVwcvQTAaHNnYRpKC5IUlTh0JEtw9fGlnp1Ta7U1ENBLbKdpywczElhZu+hOQ892zqOj3CwA+U2329/d6cd7YnqIKoFN9DWT3kS5K6JE4IoBfQEVekIOs23bKjNLvPoOmi6CroAhu/K8j+NCWQjge5eJf2x/yTnIIP1PlEcXoHIr8io517posIx3TBup+CN8bNS1PpDW3jyD3ttl1uoBudjOQrobNnJeR6Rn67DRkG6IhSwr3BWj8alwUG5mTdZzwV5Pa9KZFdIiqX7NoDGg+itsR39QCn0thK8lGRNSR8KrWC1PSjecwelKBO7uQ7rnk/rkrZdBWR4oEA8YgNH8tirUw5WfOr5a0AIaJicKxGKNdMxZt+zmC+bS7F4YCOGIm9KHa43RrKhoGRhRf9fHHHKUPwFGqtWG4ykcUgoamDOURJyepesBAO3FiRE9rLU6ILbB3yEqqoekborHmAJD5vf7PWItW3Q/YQKuk3kkqRcKnexPyzyyq5lUgTi8CxxZdaASIOu294wjBhhdyHlXEkVTNJ9JKkj/obF+XiIIp0cBDsOXY9hDQ== pandas-dev@python.org' - sshKeySecureFile: 'pandas_docs_key' - displayName: 'Install GitHub ssh deployment key' - condition : | - and(not(eq(variables['Build.Reason'], 'PullRequest')), - eq(variables['Build.SourceBranch'], 'refs/heads/master')) - - - script: | - cd to_deploy - git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git - git push -f origin master - displayName: 'Publish web and docs to GitHub pages' - condition : | - and(not(eq(variables['Build.Reason'], 'PullRequest')), - eq(variables['Build.SourceBranch'], 'refs/heads/master')) diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index c9a2e4eefd19d..880fdc46f43f5 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -24,7 +24,7 @@ jobs: ENV_FILE: ci/deps/azure-36-locale_slow.yaml CONDA_PY: "36" PATTERN: "slow" - # pandas does not use the language (zh_CN), but should support diferent encodings (utf8) + # pandas does not use the language (zh_CN), but should support different encodings (utf8) # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any LANG: "zh_CN.utf8" LC_ALL: "zh_CN.utf8" @@ -38,11 +38,11 @@ jobs: LC_ALL: "it_IT.utf8" EXTRA_APT: "language-pack-it xsel" - py36_32bit: - ENV_FILE: ci/deps/azure-36-32bit.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network and not clipboard" - BITS32: "yes" + #py36_32bit: + # ENV_FILE: ci/deps/azure-36-32bit.yaml + # CONDA_PY: "36" + # PATTERN: "not slow and not network and not clipboard" + # BITS32: "yes" py37_locale: ENV_FILE: ci/deps/azure-37-locale.yaml diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e2dc543360a62..45b7db74fa409 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -102,9 +102,17 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then MSG='Check for use of not concatenated strings' ; echo $MSG if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:{msg}" . 
+ $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" --format="##[error]{source_path}:{line_number}:{msg}" . else - $BASE_DIR/scripts/validate_string_concatenation.py . + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" . + fi + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for strings with wrong placed spaces' ; echo $MSG + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" --format="##[error]{source_path}:{line_number}:{msg}" . + else + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" . fi RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -113,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # Imports - Check formatting using isort see setup.cfg for settings MSG='Check import format using isort' ; echo $MSG - ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench" + ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench scripts" if [[ "$GITHUB_ACTIONS" == "true" ]]; then eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) else @@ -258,57 +266,80 @@ fi ### DOCTESTS ### if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then - MSG='Doctests frame.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/frame.py + # Individual files + + MSG='Doctests accessor.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/accessor.py RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests series.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/series.py \ - -k"-nonzero -reindex -searchsorted -to_dict" + MSG='Doctests aggregation.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/aggregation.py RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests generic.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/generic.py \ - -k"-_set_axis_name -_xs -describe -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard" + MSG='Doctests base.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/base.py + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests construction.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/construction.py RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests groupby.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/groupby/groupby.py -k"-cumcount -describe -pipe" + MSG='Doctests frame.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/frame.py RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests datetimes.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/tools/datetimes.py + MSG='Doctests generic.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/generic.py RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests top-level reshaping functions' ; echo $MSG - pytest -q --doctest-modules \ - pandas/core/reshape/concat.py \ - pandas/core/reshape/pivot.py \ - pandas/core/reshape/reshape.py \ - pandas/core/reshape/tile.py \ - pandas/core/reshape/melt.py \ - -k"-crosstab -pivot_table -cut" + MSG='Doctests series.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/series.py RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests interval classes' ; echo $MSG - pytest -q --doctest-modules \ - pandas/core/indexes/interval.py \ - pandas/core/arrays/interval.py + MSG='Doctests strings.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/strings.py RET=$(($RET + $?)) ; echo 
$MSG "DONE" + # Directories + MSG='Doctests arrays'; echo $MSG - pytest -q --doctest-modules \ - pandas/core/arrays/string_.py \ - pandas/core/arrays/integer.py \ - pandas/core/arrays/boolean.py + pytest -q --doctest-modules pandas/core/arrays/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests computation' ; echo $MSG + pytest -q --doctest-modules pandas/core/computation/ RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests dtypes'; echo $MSG pytest -q --doctest-modules pandas/core/dtypes/ RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests arrays/boolean.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/arrays/boolean.py + MSG='Doctests groupby' ; echo $MSG + pytest -q --doctest-modules pandas/core/groupby/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests indexes' ; echo $MSG + pytest -q --doctest-modules pandas/core/indexes/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests ops' ; echo $MSG + pytest -q --doctest-modules pandas/core/ops/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests reshape' ; echo $MSG + pytest -q --doctest-modules pandas/core/reshape/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests tools' ; echo $MSG + pytest -q --doctest-modules pandas/core/tools/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests window' ; echo $MSG + pytest -q --doctest-modules pandas/core/window/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests tseries' ; echo $MSG + pytest -q --doctest-modules pandas/tseries/ RET=$(($RET + $?)) ; echo $MSG "DONE" fi @@ -320,6 +351,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Validate correct capitalization among titles in documentation' ; echo $MSG + $BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/development $BASE_DIR/doc/source/reference + RET=$(($RET + $?)) ; echo $MSG "DONE" + fi ### DEPENDENCIES ### diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml index cf3fca307481f..15704cf0d5427 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-36-32bit.yaml @@ -22,5 +22,5 @@ dependencies: # see comment above - pip - pip: - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 810554632a507..56da56b45b702 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.6.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 48ac50c001715..c086b3651afc3 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.6.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index de7e011d9c7ca..e553330b962a2 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.6.1 # tools - - cython=0.29.13 + - cython=0.29.16 - pytest=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 @@ -22,7 +22,7 @@ dependencies: - numpy=1.13.3 - openpyxl=2.5.7 - pytables=3.4.2 - - 
python-dateutil=2.6.1 + - python-dateutil=2.7.3 - pytz=2017.2 - scipy=0.19.0 - xlrd=1.1.0 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index dc51597a33209..31155ac93931a 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index a04bdc2448bce..29ebfe2639e32 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -5,7 +5,6 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 @@ -15,6 +14,7 @@ dependencies: - pytz - pip - pip: + - cython>=0.29.16 - "git+git://github.com/dateutil/dateutil.git" - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com" - "--pre" diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 90980133b31c1..93885afbc4114 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -5,7 +5,6 @@ dependencies: - python=3.6.* # tools - - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 @@ -24,7 +23,7 @@ dependencies: - openpyxl - pyarrow>=0.13.0 - pytables - - python-dateutil==2.6.1 + - python-dateutil==2.7.3 - pytz - xarray - xlrd @@ -32,5 +31,6 @@ dependencies: - xlwt - pip - pip: + - cython>=0.29.16 - pyreadstat - pyxlsb diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 663c55492e69e..548660cabaa67 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.6.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 6b3ad6f560292..e491fd57b240b 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 6883301a63a9b..2968c8f188d49 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.6.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - beautifulsoup4 - botocore>=1.11 - - cython>=0.29.13 + - cython>=0.29.16 - dask - fastparquet>=0.3.2 - gcsfs diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index d0bc046575953..3fc19f1bca084 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.6.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index 1dfd90d0904ac..df693f0e22c71 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.6.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 682b1016ff3a2..986728d0a4a40 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -6,7 
+6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml index a627b7edc175f..b879c0f81dab2 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/travis-38.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.8.* # tools - - cython>=0.29.13 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/doc/make.py b/doc/make.py index 024a748cd28ca..db729853e5834 100755 --- a/doc/make.py +++ b/doc/make.py @@ -83,8 +83,8 @@ def _process_single_doc(self, single_doc): obj = pandas # noqa: F821 for name in single_doc.split("."): obj = getattr(obj, name) - except AttributeError: - raise ImportError(f"Could not import {single_doc}") + except AttributeError as err: + raise ImportError(f"Could not import {single_doc}") from err else: return single_doc[len("pandas.") :] else: diff --git a/doc/redirects.csv b/doc/redirects.csv index ef93955c14fe6..b59ccf649ee21 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -7,13 +7,10 @@ release,whatsnew/index # getting started install,getting_started/install -10min,getting_started/10min -basics,getting_started/basics comparison_with_r,getting_started/comparison/comparison_with_r comparison_with_sql,getting_started/comparison/comparison_with_sql comparison_with_sas,getting_started/comparison/comparison_with_sas comparison_with_stata,getting_started/comparison/comparison_with_stata -dsintro,getting_started/dsintro overview,getting_started/overview tutorials,getting_started/tutorials @@ -38,6 +35,9 @@ text,user_guide/text timedeltas,user_guide/timedeltas timeseries,user_guide/timeseries visualization,user_guide/visualization +10min,user_guide/10min +basics,user_guide/basics +dsintro,user_guide/dsintro # development contributing,development/contributing @@ -49,7 +49,25 @@ internals,development/internals # api moved function reference/api/pandas.io.json.json_normalize,pandas.json_normalize -# api rename +# rename due to refactors +reference/api/pandas.core.window.Rolling,pandas.core.window.rolling.Rolling +reference/api/pandas.core.window.Rolling.aggregate,pandas.core.window.rolling.Rolling.aggregate +reference/api/pandas.core.window.Rolling.apply,pandas.core.window.rolling.Rolling.apply +reference/api/pandas.core.window.Rolling.corr,pandas.core.window.rolling.Rolling.corr +reference/api/pandas.core.window.Rolling.count,pandas.core.window.rolling.Rolling.count +reference/api/pandas.core.window.Rolling.cov,pandas.core.window.rolling.Rolling.cov +reference/api/pandas.core.window.Rolling.kurt,pandas.core.window.rolling.Rolling.kurt +reference/api/pandas.core.window.Rolling.max,pandas.core.window.rolling.Rolling.max +reference/api/pandas.core.window.Rolling.mean,pandas.core.window.rolling.Rolling.mean +reference/api/pandas.core.window.Rolling.median,pandas.core.window.rolling.Rolling.median +reference/api/pandas.core.window.Rolling.min,pandas.core.window.rolling.Rolling.min +reference/api/pandas.core.window.Rolling.quantile,pandas.core.window.rolling.Rolling.quantile +reference/api/pandas.core.window.Rolling.skew,pandas.core.window.rolling.Rolling.skew +reference/api/pandas.core.window.Rolling.std,pandas.core.window.rolling.Rolling.std +reference/api/pandas.core.window.Rolling.sum,pandas.core.window.rolling.Rolling.sum +reference/api/pandas.core.window.Rolling.var,pandas.core.window.rolling.Rolling.var + +# api url change (generated -> reference/api rename) api,reference/index 
generated/pandas.api.extensions.ExtensionArray.argsort,../reference/api/pandas.api.extensions.ExtensionArray.argsort generated/pandas.api.extensions.ExtensionArray.astype,../reference/api/pandas.api.extensions.ExtensionArray.astype diff --git a/doc/source/_static/question_mark_noback.svg b/doc/source/_static/question_mark_noback.svg new file mode 100644 index 0000000000000..3abb4b806d20a --- /dev/null +++ b/doc/source/_static/question_mark_noback.svg @@ -0,0 +1,72 @@ + + + + + + + + + + image/svg+xml + + + + + + + ? + + diff --git a/doc/source/conf.py b/doc/source/conf.py index a95cd4ab696f7..d2404b757ca11 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -109,6 +109,7 @@ ) ) autosummary_generate = True if pattern is None else ["index"] +autodoc_typehints = "none" # numpydoc numpydoc_attributes_as_param_list = False @@ -195,7 +196,7 @@ # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = "pandas_sphinx_theme" +html_theme = "pydata_sphinx_theme" # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths @@ -415,6 +416,7 @@ "python": ("https://docs.python.org/3/", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), "statsmodels": ("https://www.statsmodels.org/devel/", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), } # extlinks alias diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst index 17f8783f71bfb..6d33537a40175 100644 --- a/doc/source/development/code_style.rst +++ b/doc/source/development/code_style.rst @@ -18,12 +18,12 @@ consistent code format throughout the project. For details see the Patterns ======== -foo.__class__ -------------- +Using foo.__class__ +------------------- -*pandas* uses 'type(foo)' instead 'foo.__class__' as it is making the code more -readable. +pandas uses 'type(foo)' instead 'foo.__class__' as it is making the code more +readable. For example: **Good:** @@ -47,13 +47,13 @@ String formatting Concatenated strings -------------------- -f-strings -~~~~~~~~~ +Using f-strings +~~~~~~~~~~~~~~~ -*pandas* uses f-strings formatting instead of '%' and '.format()' string formatters. +pandas uses f-strings formatting instead of '%' and '.format()' string formatters. -The convention of using f-strings on a string that is concatenated over serveral lines, -is to prefix only the lines containing the value needs to be interpeted. +The convention of using f-strings on a string that is concatenated over several lines, +is to prefix only the lines containing values which need to be interpreted. For example: @@ -86,8 +86,8 @@ For example: White spaces ~~~~~~~~~~~~ -Putting the white space only at the end of the previous line, so -there is no whitespace at the beggining of the concatenated string. +Only put white space at the end of the previous line, so +there is no whitespace at the beginning of the concatenated string. For example: @@ -114,9 +114,9 @@ For example: Representation function (aka 'repr()') -------------------------------------- -*pandas* uses 'repr()' instead of '%r' and '!r'. +pandas uses 'repr()' instead of '%r' and '!r'. -The use of 'repr()' will only happend when the value is not an obvious string. +The use of 'repr()' will only happen when the value is not an obvious string. 
For example: @@ -138,7 +138,7 @@ For example: Imports (aim for absolute) ========================== -In Python 3, absolute imports are recommended. In absolute import doing something +In Python 3, absolute imports are recommended. Using absolute imports, doing something like ``import string`` will import the string module rather than ``string.py`` in the same directory. As much as possible, you should try to write out absolute imports that show the whole import chain from top-level pandas. diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index f904781178656..ba7f7eb907f4a 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -53,7 +53,7 @@ Feel free to ask questions on the `mailing list Bug reports and enhancement requests ==================================== -Bug reports are an important part of making *pandas* more stable. Having a complete bug report +Bug reports are an important part of making pandas more stable. Having a complete bug report will allow others to reproduce the bug and provide insight into fixing. See `this stackoverflow article `_ and `this blogpost `_ @@ -75,14 +75,14 @@ Bug reports must: ... ``` -#. Include the full version string of *pandas* and its dependencies. You can use the built-in function:: +#. Include the full version string of pandas and its dependencies. You can use the built-in function:: >>> import pandas as pd >>> pd.show_versions() #. Explain why the current behavior is wrong/not desired and what you expect instead. -The issue will then show up to the *pandas* community and be open to comments/ideas from others. +The issue will then show up to the pandas community and be open to comments/ideas from others. .. _contributing.github: @@ -90,14 +90,14 @@ Working with the code ===================== Now that you have an issue you want to fix, enhancement to add, or documentation to improve, -you need to learn how to work with GitHub and the *pandas* code base. +you need to learn how to work with GitHub and the pandas code base. .. _contributing.version_control: Version control, Git, and GitHub -------------------------------- -To the new user, working with Git is one of the more daunting aspects of contributing to *pandas*. +To the new user, working with Git is one of the more daunting aspects of contributing to pandas. It can very quickly become overwhelming, but sticking to the guidelines below will help keep the process straightforward and mostly trouble free. As always, if you are having difficulties please feel free to ask for help. @@ -146,7 +146,7 @@ requires a C compiler and Python environment. If you're making documentation changes, you can skip to :ref:`contributing.documentation` but you won't be able to build the documentation locally before pushing your changes. -Using a Docker Container +Using a Docker container ~~~~~~~~~~~~~~~~~~~~~~~~ Instead of manually setting up a development environment, you can use Docker to @@ -221,7 +221,7 @@ environment: `_ * Make sure your conda is up to date (``conda update conda``) * Make sure that you have :ref:`cloned the repository ` -* ``cd`` to the *pandas* source directory +* ``cd`` to the pandas source directory We'll now kick off a three-step process: @@ -295,7 +295,7 @@ Below is a brief overview on how to set-up a virtual environment with Powershell under Windows. For details please refer to the `official virtualenv user guide `__ -Use an ENV_DIR of your choice. 
We'll use ~\virtualenvs\pandas-dev where +Use an ENV_DIR of your choice. We'll use ~\\virtualenvs\\pandas-dev where '~' is the folder pointed to by either $env:USERPROFILE (Powershell) or %USERPROFILE% (cmd.exe) environment variable. Any parent directories should already exist. @@ -330,7 +330,7 @@ The above can be simplified to:: This changes your working directory to the shiny-new-feature branch. Keep any changes in this branch specific to one bug or feature so it is clear -what the branch brings to *pandas*. You can have many shiny-new-features +what the branch brings to pandas. You can have many shiny-new-features and switch in between them using the git checkout command. When creating this branch, make sure your master branch is up to date with @@ -349,9 +349,9 @@ you created the branch, check the section on Contributing to the documentation ================================= -Contributing to the documentation benefits everyone who uses *pandas*. +Contributing to the documentation benefits everyone who uses pandas. We encourage you to help us improve the documentation, and -you don't have to be an expert on *pandas* to do so! In fact, +you don't have to be an expert on pandas to do so! In fact, there are sections of the docs that are worse off after being written by experts. If something in the docs doesn't make sense to you, updating the relevant section after you figure it out is a great way to ensure it will help @@ -361,7 +361,7 @@ the next person. :local: -About the *pandas* documentation +About the pandas documentation -------------------------------- The documentation is written in **reStructuredText**, which is almost like writing @@ -372,7 +372,7 @@ complex changes to the documentation as well. Some other important things to know about the docs: -* The *pandas* documentation consists of two parts: the docstrings in the code +* The pandas documentation consists of two parts: the docstrings in the code itself and the docs in this folder ``doc/``. The docstrings provide a clear explanation of the usage of the individual @@ -452,7 +452,7 @@ This will identify methods documented in ``doc/source/reference`` that are not a class methods, and existing methods that are not documented in ``doc/source/reference``. -Updating a *pandas* docstring +Updating a pandas docstring ----------------------------- When improving a single function or method's docstring, it is not necessarily @@ -477,7 +477,7 @@ When doing a PR with a docstring update, it is good to post the output of the validation script in a comment on github. -How to build the *pandas* documentation +How to build the pandas documentation --------------------------------------- Requirements @@ -543,7 +543,7 @@ And you'll have the satisfaction of seeing your new and improved documentation! Building master branch documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When pull requests are merged into the *pandas* ``master`` branch, the main parts of +When pull requests are merged into the pandas ``master`` branch, the main parts of the documentation are also built by Travis-CI. These docs are then hosted `here `__, see also the :ref:`Continuous Integration ` section. @@ -563,7 +563,7 @@ Writing good code is not just about what you write. It is also about *how* you write it. During :ref:`Continuous Integration ` testing, several tools will be run to check your code for stylistic errors. Generating any warnings will cause the test to fail. -Thus, good style is a requirement for submitting code to *pandas*. 
+Thus, good style is a requirement for submitting code to pandas. There is a tool in pandas to help contributors verify their changes before contributing them to the project:: @@ -601,7 +601,7 @@ set in the ``pandas.compat._optional.VERSIONS`` dict. C (cpplint) ~~~~~~~~~~~ -*pandas* uses the `Google `_ +pandas uses the `Google `_ standard. Google provides an open source style checker called ``cpplint``, but we use a fork of it that can be found `here `__. Here are *some* of the more common ``cpplint`` issues: @@ -652,7 +652,7 @@ fixes manually. Python (PEP8 / black) ~~~~~~~~~~~~~~~~~~~~~ -*pandas* follows the `PEP8 `_ standard +pandas follows the `PEP8 `_ standard and uses `Black `_ and `Flake8 `_ to ensure a consistent code format throughout the project. @@ -703,7 +703,7 @@ Note that these commands can be run analogously with ``black``. Import formatting ~~~~~~~~~~~~~~~~~ -*pandas* uses `isort `__ to standardise import +pandas uses `isort `__ to standardise import formatting across the codebase. A guide to import layout as per pep8 can be found `here `__. @@ -754,7 +754,7 @@ You can then verify the changes look ok, then git :ref:`commit `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. +Commonly used types specific to pandas will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. -For example, quite a few functions in *pandas* accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module +For example, quite a few functions in pandas accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module .. code-block:: python @@ -919,10 +919,10 @@ For example, quite a few functions in *pandas* accept a ``dtype`` argument. This This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. -Validating Type Hints +Validating type hints ~~~~~~~~~~~~~~~~~~~~~ -*pandas* uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running +pandas uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running .. 
code-block:: shell @@ -933,7 +933,7 @@ Validating Type Hints Testing with continuous integration ----------------------------------- -The *pandas* test suite will run automatically on `Travis-CI `__ and +The pandas test suite will run automatically on `Travis-CI `__ and `Azure Pipelines `__ continuous integration services, once your pull request is submitted. However, if you wish to run the test suite on a branch prior to submitting the pull request, @@ -959,7 +959,7 @@ This is an example of a green build. Test-driven development/code writing ------------------------------------ -*pandas* is serious about testing and strongly encourages contributors to embrace +pandas is serious about testing and strongly encourages contributors to embrace `test-driven development (TDD) `_. This development process "relies on the repetition of a very short development cycle: first the developer writes an (initially failing) automated test case that defines a desired @@ -968,10 +968,10 @@ So, before actually writing any code, you should write your tests. Often the te taken from the original GitHub issue. However, it is always worth considering additional use cases and writing corresponding tests. -Adding tests is one of the most common requests after code is pushed to *pandas*. Therefore, +Adding tests is one of the most common requests after code is pushed to pandas. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue. -Like many packages, *pandas* uses `pytest +Like many packages, pandas uses `pytest `_ and the convenient extensions in `numpy.testing `_. @@ -1018,7 +1018,7 @@ E.g. "# brief comment, see GH#28907" Transitioning to ``pytest`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*pandas* existing test structure is *mostly* class-based, meaning that you will typically find tests wrapped in a class. +pandas existing test structure is *mostly* class-based, meaning that you will typically find tests wrapped in a class. .. code-block:: python @@ -1220,7 +1220,7 @@ Running the test suite ---------------------- The tests can then be run directly inside your Git clone (without having to -install *pandas*) by typing:: +install pandas) by typing:: pytest pandas @@ -1272,9 +1272,9 @@ Running the performance test suite ---------------------------------- Performance matters and it is worth considering whether your code has introduced -performance regressions. *pandas* is in the process of migrating to +performance regressions. pandas is in the process of migrating to `asv benchmarks `__ -to enable easy monitoring of the performance of critical *pandas* operations. +to enable easy monitoring of the performance of critical pandas operations. These benchmarks are all found in the ``pandas/asv_bench`` directory. asv supports both python2 and python3. @@ -1354,14 +1354,14 @@ directive is used. The sphinx syntax for that is: .. code-block:: rst - .. versionadded:: 0.21.0 + .. versionadded:: 1.1.0 -This will put the text *New in version 0.21.0* wherever you put the sphinx +This will put the text *New in version 1.1.0* wherever you put the sphinx directive. This should also be put in the docstring when adding a new function or method (`example `__) or a new keyword argument (`example `__). -Contributing your changes to *pandas* +Contributing your changes to pandas ===================================== .. 
_contributing.commit-code: @@ -1386,7 +1386,7 @@ Doing 'git status' again should give something like:: # modified: /relative/path/to/file-you-added.py # -Finally, commit your changes to your local repository with an explanatory message. *Pandas* +Finally, commit your changes to your local repository with an explanatory message. pandas uses a convention for commit message prefixes and layout. Here are some common prefixes along with general guidelines for when to use them: @@ -1434,7 +1434,7 @@ like:: upstream git://github.com/pandas-dev/pandas.git (fetch) upstream git://github.com/pandas-dev/pandas.git (push) -Now your code is on GitHub, but it is not yet a part of the *pandas* project. For that to +Now your code is on GitHub, but it is not yet a part of the pandas project. For that to happen, a pull request needs to be submitted on GitHub. Review your code @@ -1539,7 +1539,7 @@ The branch will still exist on GitHub, so to delete it there do:: .. _Gitter: https://gitter.im/pydata/pandas -Tips for a successful Pull Request +Tips for a successful pull request ================================== If you have made it to the `Review your code`_ phase, one of the core contributors may diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 1c99b341f6c5a..0c780ad5f5847 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -17,7 +17,7 @@ Also, it is a common practice to generate online (html) documentation automatically from docstrings. `Sphinx `_ serves this purpose. -Next example gives an idea on how a docstring looks like: +The next example gives an idea of what a docstring looks like: .. code-block:: python @@ -26,8 +26,8 @@ Next example gives an idea on how a docstring looks like: Add up two integer numbers. This function simply wraps the `+` operator, and does not - do anything interesting, except for illustrating what is - the docstring of a very simple function. + do anything interesting, except for illustrating what + the docstring of a very simple function looks like. Parameters ---------- @@ -56,14 +56,14 @@ Next example gives an idea on how a docstring looks like: """ return num1 + num2 -Some standards exist about docstrings, so they are easier to read, and they can -be exported to other formats such as html or pdf. +Some standards regarding docstrings exist, which make them easier to read, and allow them +be easily exported to other formats such as html or pdf. The first conventions every Python docstring should follow are defined in `PEP-257 `_. -As PEP-257 is quite open, and some other standards exist on top of it. In the -case of pandas, the numpy docstring convention is followed. The conventions is +As PEP-257 is quite broad, other more specific standards also exist. In the +case of pandas, the numpy docstring convention is followed. These conventions are explained in this document: * `numpydoc docstring guide `_ @@ -80,11 +80,11 @@ about reStructuredText can be found in: * `Quick reStructuredText reference `_ * `Full reStructuredText specification `_ -Pandas has some helpers for sharing docstrings between related classes, see +pandas has some helpers for sharing docstrings between related classes, see :ref:`docstring.sharing`. -The rest of this document will summarize all the above guides, and will -provide additional convention specific to the pandas project. 
+The rest of this document will summarize all the above guidelines, and will +provide additional conventions specific to the pandas project. .. _docstring.tutorial: @@ -101,9 +101,9 @@ left before or after the docstring. The text starts in the next line after the opening quotes. The closing quotes have their own line (meaning that they are not at the end of the last sentence). -In rare occasions reST styles like bold text or italics will be used in +On rare occasions reST styles like bold text or italics will be used in docstrings, but is it common to have inline code, which is presented between -backticks. It is considered inline code: +backticks. The following are considered inline code: * The name of a parameter * Python code, a module, function, built-in, type, literal... (e.g. ``os``, @@ -160,7 +160,7 @@ backticks. It is considered inline code: .. _docstring.short_summary: -Section 1: Short summary +Section 1: short summary ~~~~~~~~~~~~~~~~~~~~~~~~ The short summary is a single sentence that expresses what the function does in @@ -228,15 +228,15 @@ infinitive verb. .. _docstring.extended_summary: -Section 2: Extended summary +Section 2: extended summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~ The extended summary provides details on what the function does. It should not go into the details of the parameters, or discuss implementation notes, which go in other sections. -A blank line is left between the short summary and the extended summary. And -every paragraph in the extended summary is finished by a dot. +A blank line is left between the short summary and the extended summary. +Every paragraph in the extended summary ends with a dot. The extended summary should provide details on why the function is useful and their use cases, if it is not too generic. @@ -259,7 +259,7 @@ their use cases, if it is not too generic. .. _docstring.parameters: -Section 3: Parameters +Section 3: parameters ~~~~~~~~~~~~~~~~~~~~~ The details of the parameters will be added in this section. This section has @@ -424,7 +424,7 @@ For axis, the convention is to use something like: .. _docstring.returns: -Section 4: Returns or Yields +Section 4: returns or yields ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If the method returns a value, it will be documented in this section. Also @@ -505,7 +505,7 @@ If the method yields its value: .. _docstring.see_also: -Section 5: See Also +Section 5: see also ~~~~~~~~~~~~~~~~~~~ This section is used to let users know about pandas functionality @@ -542,19 +542,19 @@ first (not an alias like ``np``). If the function is in a module which is not the main one, like ``scipy.sparse``, list the full module (e.g. ``scipy.sparse.coo_matrix``). -This section, as the previous, also has a header, "See Also" (note the capital -S and A). Also followed by the line with hyphens, and preceded by a blank line. +This section has a header, "See Also" (note the capital +S and A), followed by the line with hyphens and preceded by a blank line. After the header, we will add a line for each related method or function, followed by a space, a colon, another space, and a short description that -illustrated what this method or function does, why is it relevant in this -context, and what are the key differences between the documented function and -the one referencing. The description must also finish with a dot. +illustrates what this method or function does, why is it relevant in this +context, and what the key differences are between the documented function and +the one being referenced. The description must also end with a dot. 
-Note that in "Returns" and "Yields", the description is located in the -following line than the type. But in this section it is located in the same -line, with a colon in between. If the description does not fit in the same -line, it can continue in the next ones, but it has to be indented in them. +Note that in "Returns" and "Yields", the description is located on the line +after the type. In this section, however, it is located on the same +line, with a colon in between. If the description does not fit on the same +line, it can continue onto other lines which must be further indented. For example: @@ -583,11 +583,11 @@ For example: .. _docstring.notes: -Section 6: Notes +Section 6: notes ~~~~~~~~~~~~~~~~ This is an optional section used for notes about the implementation of the -algorithm. Or to document technical aspects of the function behavior. +algorithm, or to document technical aspects of the function behavior. Feel free to skip it, unless you are familiar with the implementation of the algorithm, or you discover some counter-intuitive behavior while writing the @@ -597,18 +597,18 @@ This section follows the same format as the extended summary section. .. _docstring.examples: -Section 7: Examples +Section 7: examples ~~~~~~~~~~~~~~~~~~~ -This is one of the most important sections of a docstring, even if it is -placed in the last position. As often, people understand concepts better -with examples, than with accurate explanations. +This is one of the most important sections of a docstring, despite being +placed in the last position, as often people understand concepts better +by example than through accurate explanations. Examples in docstrings, besides illustrating the usage of the function or -method, must be valid Python code, that in a deterministic way returns the -presented output, and that can be copied and run by users. +method, must be valid Python code, that returns the given output in a +deterministic way, and that can be copied and run by users. -They are presented as a session in the Python terminal. `>>>` is used to +Examples are presented as a session in the Python terminal. `>>>` is used to present code. `...` is used for code continuing from the previous line. Output is presented immediately after the last line of code generating the output (no blank lines in between). Comments describing the examples can @@ -636,7 +636,7 @@ A simple example could be: Return the first elements of the Series. This function is mainly useful to preview the values of the - Series without displaying the whole of it. + Series without displaying all of it. Parameters ---------- @@ -932,7 +932,7 @@ plot will be generated automatically when building the documentation. Sharing docstrings ------------------ -Pandas has a system for sharing docstrings, with slight variations, between +pandas has a system for sharing docstrings, with slight variations, between classes. This helps us keep docstrings consistent, while keeping things clear for the user reading. It comes at the cost of some complexity when writing. @@ -998,4 +998,4 @@ mapping function names to docstrings. Wherever possible, we prefer using See ``pandas.core.generic.NDFrame.fillna`` for an example template, and ``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna`` -for the filled versions. +for the filled versions. 
\ No newline at end of file diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index 33646e5d74757..fbd83af3de82e 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -62,7 +62,7 @@ for each column, *including the index columns*. This has JSON form: See below for the detailed specification for these. -Index Metadata Descriptors +Index metadata descriptors ~~~~~~~~~~~~~~~~~~~~~~~~~~ ``RangeIndex`` can be stored as metadata only, not requiring serialization. The @@ -89,7 +89,7 @@ with other column names) a disambiguating name with pattern matching columns, ``name`` attribute is always stored in the column descriptors as above. -Column Metadata +Column metadata ~~~~~~~~~~~~~~~ ``pandas_type`` is the logical type of the column, and is one of: @@ -182,4 +182,4 @@ As an example of fully-formed metadata: 'creator': { 'library': 'pyarrow', 'version': '0.13.0' - }} + }} \ No newline at end of file diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 270f20e8118bc..d9fb2643e8a1a 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -7,7 +7,7 @@ Extending pandas **************** While pandas provides a rich set of methods, containers, and data types, your -needs may not be fully satisfied. Pandas offers a few options for extending +needs may not be fully satisfied. pandas offers a few options for extending pandas. .. _extending.register-accessors: @@ -80,8 +80,8 @@ Extension types The :class:`pandas.api.extensions.ExtensionDtype` and :class:`pandas.api.extensions.ExtensionArray` APIs are new and experimental. They may change between versions without warning. -Pandas defines an interface for implementing data types and arrays that *extend* -NumPy's type system. Pandas itself uses the extension system for some types +pandas defines an interface for implementing data types and arrays that *extend* +NumPy's type system. pandas itself uses the extension system for some types that aren't built into NumPy (categorical, period, interval, datetime with timezone). @@ -122,7 +122,7 @@ This class provides all the array-like functionality. ExtensionArrays are limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the ``dtype`` attribute. -Pandas makes no restrictions on how an extension array is created via its +pandas makes no restrictions on how an extension array is created via its ``__new__`` or ``__init__``, and puts no restrictions on how you store your data. We do require that your array be convertible to a NumPy array, even if this is relatively expensive (as it is for ``Categorical``). @@ -139,7 +139,7 @@ and comments contain guidance for properly implementing the interface. .. _extending.extension.operator: -:class:`~pandas.api.extensions.ExtensionArray` Operator Support +:class:`~pandas.api.extensions.ExtensionArray` operator support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. versionadded:: 0.24.0 @@ -210,7 +210,7 @@ will .. _extending.extension.ufunc: -NumPy Universal Functions +NumPy universal functions ^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`Series` implements ``__array_ufunc__``. As part of the implementation, @@ -224,7 +224,7 @@ for an example. As part of your implementation, we require that you defer to pandas when a pandas container (:class:`Series`, :class:`DataFrame`, :class:`Index`) is detected in ``inputs``. -If any of those is present, you should return ``NotImplemented``. 
Pandas will take care of +If any of those is present, you should return ``NotImplemented``. pandas will take care of unboxing the array from the container and re-calling the ufunc with the unwrapped input. .. _extending.extension.testing: @@ -501,4 +501,4 @@ registers the default "matplotlib" backend as follows. More information on how to implement a third-party plotting backend can be found at -https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. +https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. \ No newline at end of file diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst index 748caae295460..8f1c3d5d818c2 100644 --- a/doc/source/development/internals.rst +++ b/doc/source/development/internals.rst @@ -85,20 +85,14 @@ if you compute the levels and codes yourself, please be careful. Values ~~~~~~ -Pandas extends NumPy's type system with custom types, like ``Categorical`` or +pandas extends NumPy's type system with custom types, like ``Categorical`` or datetimes with a timezone, so we have multiple notions of "values". For 1-D containers (``Index`` classes and ``Series``) we have the following convention: -* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally, - ``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``, - this returns the codes, not the array of objects. * ``cls._values`` refers is the "best possible" array. This could be an - ``ndarray``, ``ExtensionArray``, or in ``Index`` subclass (note: we're in the - process of removing the index subclasses here so that it's always an - ``ndarray`` or ``ExtensionArray``). + ``ndarray`` or ``ExtensionArray``. -So, for example, ``Series[category]._values`` is a ``Categorical``, while -``Series[category]._ndarray_values`` is the underlying codes. +So, for example, ``Series[category]._values`` is a ``Categorical``. .. _ref-subclassing-pandas: diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index e65b66fc243c5..9f9e9dc2631f3 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -1,7 +1,7 @@ .. _maintaining: ****************** -Pandas Maintenance +pandas maintenance ****************** This guide is for pandas' maintainers. It may also be interesting to contributors @@ -13,7 +13,7 @@ The main contributing guide is available at :ref:`contributing`. Roles ----- -Pandas uses two levels of permissions: **triage** and **core** team members. +pandas uses two levels of permissions: **triage** and **core** team members. Triage members can label and close issues and pull requests. @@ -25,7 +25,7 @@ GitHub publishes the full `list of permissions`_. Tasks ----- -Pandas is largely a volunteer project, so these tasks shouldn't be read as +pandas is largely a volunteer project, so these tasks shouldn't be read as "expectations" of triage and maintainers. Rather, they're general descriptions of what it means to be a maintainer. @@ -41,7 +41,7 @@ reading. .. _maintaining.triage: -Issue Triage +Issue triage ------------ @@ -123,7 +123,7 @@ Here's a typical workflow for triaging a newly opened issue. .. _maintaining.closing: -Closing Issues +Closing issues -------------- Be delicate here: many people interpret closing an issue as us saying that the @@ -132,7 +132,7 @@ respond or self-close their issue if it's determined that the behavior is not a or the feature is out of scope. 
Sometimes reporters just go away though, and we'll close the issue after the conversation has died. -Reviewing Pull Requests +Reviewing pull requests ----------------------- Anybody can review a pull request: regular contributors, triagers, or core-team @@ -144,7 +144,7 @@ members. Here are some guidelines to check. * User-facing changes should have a whatsnew in the appropriate file. * Regression tests should reference the original GitHub issue number like ``# GH-1234``. -Cleaning up old Issues +Cleaning up old issues ---------------------- Every open issue in pandas has a cost. Open issues make finding duplicates harder, @@ -164,7 +164,7 @@ If an older issue lacks a reproducible example, label it as "Needs Info" and ask them to provide one (or write one yourself if possible). If one isn't provide reasonably soon, close it according to the policies in :ref:`maintaining.closing`. -Cleaning up old Pull Requests +Cleaning up old pull requests ----------------------------- Occasionally, contributors are unable to finish off a pull request. diff --git a/doc/source/development/meeting.rst b/doc/source/development/meeting.rst index 1d19408692cda..35826af5912c2 100644 --- a/doc/source/development/meeting.rst +++ b/doc/source/development/meeting.rst @@ -1,7 +1,7 @@ .. _meeting: ================== -Developer Meetings +Developer meetings ================== We hold regular developer meetings on the second Wednesday @@ -25,8 +25,7 @@ This calendar shows all the developer meetings. You can subscribe to this calendar with the following links: * `iCal `__ -* `Google calendar `__ +* `Google calendar `__ Additionally, we'll sometimes have one-off meetings on specific topics. These will be published on the same calendar. - diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index 224948738341e..1031bbfc46457 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -6,12 +6,12 @@ Policies .. _policies.version: -Version Policy +Version policy ~~~~~~~~~~~~~~ .. versionchanged:: 1.0.0 -Pandas uses a loose variant of semantic versioning (`SemVer`_) to govern +pandas uses a loose variant of semantic versioning (`SemVer`_) to govern deprecations, API compatibility, and version numbering. A pandas release number is made up of ``MAJOR.MINOR.PATCH``. @@ -23,7 +23,7 @@ and how to migrate existing code to the new behavior. Whenever possible, a deprecation path will be provided rather than an outright breaking change. -Pandas will introduce deprecations in **minor** releases. These deprecations +pandas will introduce deprecations in **minor** releases. These deprecations will preserve the existing behavior while emitting a warning that provide guidance on: @@ -39,19 +39,19 @@ deprecation removed in the next next major release (2.0.0). .. note:: - Pandas will sometimes make *behavior changing* bug fixes, as part of + pandas will sometimes make *behavior changing* bug fixes, as part of minor or patch releases. Whether or not a change is a bug fix or an API-breaking change is a judgement call. We'll do our best, and we invite you to participate in development discussion on the issue tracker or mailing list. These policies do not apply to features marked as **experimental** in the documentation. -Pandas may change the behavior of experimental features at any time. +pandas may change the behavior of experimental features at any time. -Python Support +Python support ~~~~~~~~~~~~~~ -Pandas will only drop support for specific Python versions (e.g. 
3.6.x, 3.7.x) in +pandas will only drop support for specific Python versions (e.g. 3.6.x, 3.7.x) in pandas **major** releases. .. _SemVer: https://semver.org diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index fafe63d80249c..d331491d02883 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -22,8 +22,8 @@ See :ref:`roadmap.evolution` for proposing changes to this document. Extensibility ------------- -Pandas :ref:`extending.extension-types` allow for extending NumPy types with custom -data types and array storage. Pandas uses extension types internally, and provides +pandas :ref:`extending.extension-types` allow for extending NumPy types with custom +data types and array storage. pandas uses extension types internally, and provides an interface for 3rd-party libraries to define their own custom data types. Many parts of pandas still unintentionally convert data to a NumPy array. @@ -71,7 +71,7 @@ Block manager rewrite We'd like to replace pandas current internal data structures (a collection of 1 or 2-D arrays) with a simpler collection of 1-D arrays. -Pandas internal data model is quite complex. A DataFrame is made up of +pandas internal data model is quite complex. A DataFrame is made up of one or more 2-dimensional "blocks", with one or more blocks per dtype. This collection of 2-D arrays is managed by the BlockManager. @@ -132,7 +132,7 @@ Some specific goals include Performance monitoring ---------------------- -Pandas uses `airspeed velocity `__ to +pandas uses `airspeed velocity `__ to monitor for performance regressions. ASV itself is a fabulous tool, but requires some additional work to be integrated into an open source project's workflow. @@ -152,10 +152,10 @@ We'd like to fund improvements and maintenance of these tools to .. _roadmap.evolution: -Roadmap Evolution +Roadmap evolution ----------------- -Pandas continues to evolve. The direction is primarily determined by community +pandas continues to evolve. The direction is primarily determined by community interest. Everyone is welcome to review existing items on the roadmap and to propose a new item. diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index b7e53b84f0e02..fd5e7c552fe0a 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -5,7 +5,7 @@ {{ header }} **************** -Pandas ecosystem +pandas ecosystem **************** Increasingly, packages are being built on top of pandas to address specific needs diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index 4e284fe7b5968..f12d97d1d0fde 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -698,7 +698,7 @@ In pandas this would be written as: tips.groupby(['sex', 'smoker']).first() -Other Considerations +Other considerations -------------------- Disk vs memory @@ -752,4 +752,4 @@ to interop data between SAS and pandas is to serialize to csv. 
Wall time: 14.6 s In [9]: %time df = pd.read_csv('big.csv') - Wall time: 4.86 s + Wall time: 4.86 s \ No newline at end of file diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 6a03c06de3699..c46ec9b3f7090 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -75,7 +75,7 @@ Filtering in SQL is done via a WHERE clause. LIMIT 5; DataFrames can be filtered in multiple ways; the most intuitive of which is using -`boolean indexing `_. +:ref:`boolean indexing ` .. ipython:: python @@ -388,10 +388,10 @@ In pandas, you can use :meth:`~pandas.concat` in conjunction with pd.concat([df1, df2]).drop_duplicates() -Pandas equivalents for some SQL analytic and aggregate functions +pandas equivalents for some SQL analytic and aggregate functions ---------------------------------------------------------------- -Top N rows with offset +Top n rows with offset ~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: sql @@ -405,7 +405,7 @@ Top N rows with offset tips.nlargest(10 + 5, columns='tip').tail(10) -Top N rows per group +Top n rows per group ~~~~~~~~~~~~~~~~~~~~ .. code-block:: sql diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index a2f8f79f22ae4..3f15c91f83c6a 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -9,8 +9,6 @@ Getting started Installation ------------ -Before you can use pandas, you’ll need to get it installed. - .. raw:: html
@@ -23,7 +21,7 @@ Before you can use pandas, you’ll need to get it installed.

-Pandas is part of the `Anaconda `__ distribution and can be +pandas is part of the `Anaconda `__ distribution and can be installed with Anaconda or Miniconda: .. raw:: html @@ -49,7 +47,7 @@ installed with Anaconda or Miniconda:

-Pandas can be installed via pip from `PyPI `__. +pandas can be installed via pip from `PyPI `__. .. raw:: html @@ -103,7 +101,7 @@ Intro to pandas

- What kind of data does Pandas handle? + What kind of data does pandas handle?
@@ -117,8 +115,8 @@ Intro to pandas
-When working with tabular data, such as data stored in spreadsheets or databases, Pandas is the right tool for you. Pandas will help you -to explore, clean and process your data. In Pandas, a data table is called a :class:`DataFrame`. +When working with tabular data, such as data stored in spreadsheets or databases, pandas is the right tool for you. pandas will help you +to explore, clean and process your data. In pandas, a data table is called a :class:`DataFrame`. .. image:: ../_static/schemas/01_table_dataframe.svg :align: center @@ -164,7 +162,7 @@ to explore, clean and process your data. In Pandas, a data table is called a :cl
-Pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). Importing data from each of these +pandas supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, parquet,…). Importing data from each of these data sources is provided by function with the prefix ``read_*``. Similarly, the ``to_*`` methods are used to store data. .. image:: ../_static/schemas/02_io_readwrite.svg @@ -212,7 +210,7 @@ data sources is provided by function with the prefix ``read_*``. Similarly, the
Selecting or filtering specific rows and/or columns? Filtering the data on a condition? Methods for slicing, selecting, and extracting the -data you need are available in Pandas. +data you need are available in pandas. .. image:: ../_static/schemas/03_subset_columns_rows.svg :align: center @@ -258,7 +256,7 @@ data you need are available in Pandas.
-Pandas provides plotting your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,...) +pandas provides plotting of your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,...) corresponding to your data. .. image:: ../_static/schemas/04_plot_overview.svg :align: center @@ -492,7 +490,7 @@ Multiple tables can be concatenated both column wise as row wise and database-li
-Pandas has great support for time series and has an extensive set of tools for working with dates, times, and time-indexed data. +pandas has great support for time series and has an extensive set of tools for working with dates, times, and time-indexed data. .. raw:: html @@ -535,7 +533,7 @@ Pandas has great support for time series and has an extensive set of tools for w
-Data sets do not only contain numerical data. Pandas provides a wide range of functions to cleaning textual data and extract useful information from it. +Data sets do not only contain numerical data. pandas provides a wide range of functions for cleaning textual data and extracting useful information from it. .. raw:: html @@ -568,9 +566,8 @@ Data sets do not only contain numerical data. Pandas provides a wide range of fu Coming from... -------------- -Currently working with other software for data manipulation in a tabular format? You're probably familiar to typical -data operations and know *what* to do with your tabular data, but lacking the syntax to execute these operations. Get to know -the pandas syntax by looking for equivalents from the software you already know: +Are you familiar with other software for manipulating tabular data? Learn +the pandas-equivalent operations compared to software you already know: .. raw:: html
R project logo
-

The R programming language provides the data.frame data structure and multiple packages, +

The R programming language provides the dataframe data structure and multiple packages, such as the tidyverse, that use and extend dataframes for convenient data handling functionality similar to pandas.
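For illustration only (a minimal sketch with made-up values, not part of the linked comparison guide), a dplyr-style ``filter()`` and ``mutate()`` pipeline maps onto plain pandas operations:

.. code-block:: python

   import pandas as pd

   # roughly what data.frame(group = ..., value = ...) builds in R
   df = pd.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})

   # filter(df, value > 1) -> boolean indexing
   df[df["value"] > 1]

   # mutate(df, double = value * 2) -> assigning a derived column
   df.assign(double=df["value"] * 2)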

@@ -597,7 +594,7 @@ the pandas syntax by looking for equivalents from the software you already know:
SQL logo
-

Already familiar to SELECT, GROUP BY, JOIN,...? +

Already familiar with SELECT, GROUP BY, JOIN, etc.? Most of these SQL manipulations have equivalents in pandas.
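As a quick taste of that correspondence (an illustrative sketch with a tiny made-up ``tips`` table, not the full comparison guide), a ``WHERE`` clause and a ``GROUP BY`` translate to boolean indexing and ``groupby``:

.. code-block:: python

   import pandas as pd

   tips = pd.DataFrame({"total_bill": [16.99, 10.34, 23.68],
                        "time": ["Dinner", "Lunch", "Dinner"]})

   # SELECT * FROM tips WHERE time = 'Dinner';
   tips[tips["time"] == "Dinner"]

   # SELECT time, AVG(total_bill) FROM tips GROUP BY time;
   tips.groupby("time")["total_bill"].mean()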

.. container:: custom-button @@ -615,7 +612,7 @@ the pandas syntax by looking for equivalents from the software you already know:

The data set included in the STATA statistical software suite corresponds - to the pandas data.frame. Many of the operations known from STATA have an equivalent + to the pandas dataframe. Many of the operations known from STATA have an equivalent in pandas.
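A hedged sketch of that correspondence (the file name and column used here are hypothetical, not taken from the guide): loading a ``.dta`` file and keeping a subset of observations looks like this in pandas.

.. code-block:: python

   import pandas as pd

   # roughly:  use "flights.dta"  followed by  keep if price > 100
   df = pd.read_stata("flights.dta")
   subset = df[df["price"] > 100]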

.. container:: custom-button @@ -632,8 +629,8 @@ the pandas syntax by looking for equivalents from the software you already know: SAS logo

The SAS statistical software suite - also provides the data set corresponding to the pandas data.frame. - Also vectorized operations, filtering, string processing operations,... from SAS have similar + also provides the data set corresponding to the pandas dataframe. + Also SAS vectorized operations, filtering, string processing operations, and more have similar functions in pandas.
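For example (again only a sketch, with invented column names), a grouped ``PROC MEANS`` and an ``UPCASE()`` call have direct pandas counterparts:

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"sex": ["F", "M", "F"], "tip": [1.0, 2.0, 3.5]})

   # PROC MEANS with a CLASS statement -> grouped aggregation
   df.groupby("sex")["tip"].mean()

   # UPCASE(sex) -> vectorized string method
   df["sex"].str.upper()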

.. container:: custom-button @@ -648,11 +645,16 @@ the pandas syntax by looking for equivalents from the software you already know:
-Community tutorials -------------------- +Tutorials +--------- + +For a quick overview of pandas functionality, see :ref:`10 Minutes to pandas<10min>`. + +You can also reference the pandas `cheat sheet `_ +for a succinct guide for manipulating data with pandas. The community produces a wide variety of tutorials available online. Some of the -material is enlisted in the community contributed :ref:`tutorials`. +material is enlisted in the community contributed :ref:`communitytutorials`. .. If you update this toctree, also update the manual toctree in the @@ -664,9 +666,6 @@ material is enlisted in the community contributed :ref:`tutorials`. install overview - 10min intro_tutorials/index - basics - dsintro comparison/index tutorials diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index bc1be527696a5..7fa2233e79fc0 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -221,7 +221,7 @@ Package Minimum support ================================================================ ========================== `setuptools `__ 24.2.0 `NumPy `__ 1.13.3 -`python-dateutil `__ 2.6.1 +`python-dateutil `__ 2.7.3 `pytz `__ 2017.2 ================================================================ ========================== diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index 02e59b3c81755..9ee3bfc3b8e79 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -26,7 +26,7 @@ documentation. -Pandas data table representation +pandas data table representation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. image:: ../../_static/schemas/01_table_dataframe.svg diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index 797bdbcf25d17..412a5f9e7485f 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -118,7 +118,7 @@ done by requesting the pandas ``dtypes`` attribute: titanic.dtypes For each of the columns, the used data type is enlisted. The data types -in this ``DataFrame`` are integers (``int64``), floats (``float63``) and +in this ``DataFrame`` are integers (``int64``), floats (``float64``) and strings (``object``). .. note:: @@ -225,7 +225,7 @@ The method :meth:`~DataFrame.info` provides technical information about a
To user guide -For a complete overview of the input and output possibilites from and to pandas, see the user guide section about :ref:`reader and writer functions `. +For a complete overview of the input and output possibilities from and to pandas, see the user guide section about :ref:`reader and writer functions `. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/03_subset_data.rst b/doc/source/getting_started/intro_tutorials/03_subset_data.rst index 7a4347905ad8d..4167166a3f34a 100644 --- a/doc/source/getting_started/intro_tutorials/03_subset_data.rst +++ b/doc/source/getting_started/intro_tutorials/03_subset_data.rst @@ -88,7 +88,7 @@ name of the column of interest. Each column in a :class:`DataFrame` is a :class:`Series`. As a single column is -selected, the returned object is a pandas :class:`DataFrame`. We can verify this +selected, the returned object is a pandas :class:`Series`. We can verify this by checking the type of the output: .. ipython:: python @@ -101,7 +101,7 @@ And have a look at the ``shape`` of the output: titanic["Age"].shape -:attr:`DataFrame.shape` is an attribute (remember :ref:`tutorial on reading and writing <10min_tut_02_read_write>`, do not use parantheses for attributes) of a +:attr:`DataFrame.shape` is an attribute (remember :ref:`tutorial on reading and writing <10min_tut_02_read_write>`, do not use parentheses for attributes) of a pandas ``Series`` and ``DataFrame`` containing the number of rows and columns: *(nrows, ncolumns)*. A pandas Series is 1-dimensional and only the number of rows is returned. diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index f317e7a1f91b4..b6b3c97f2405b 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -165,7 +165,7 @@ index. For example: .. note:: The existence of multiple row/column indices at the same time has not been mentioned within these tutorials. *Hierarchical indexing* - or *MultiIndex* is an advanced and powerfull pandas feature to analyze + or *MultiIndex* is an advanced and powerful pandas feature to analyze higher dimensional data. Multi-indexing is out of scope for this pandas introduction. For the diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index d5b4b316130bb..15bdf43543d9a 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -96,7 +96,7 @@ objects. In pandas we call these datetime objects similar to pd.read_csv("../data/air_quality_no2_long.csv", parse_dates=["datetime"]) -Why are these :class:`pandas.Timestamp` objects useful. Let’s illustrate the added +Why are these :class:`pandas.Timestamp` objects useful? Let’s illustrate the added value with some example cases. What is the start and end date of the time series data set working @@ -106,7 +106,7 @@ value with some example cases. air_quality["datetime"].min(), air_quality["datetime"].max() -Using :class:`pandas.Timestamp` for datetimes enable us to calculate with date +Using :class:`pandas.Timestamp` for datetimes enables us to calculate with date information and make them comparable. Hence, we can use this to get the length of our time series: @@ -122,7 +122,7 @@ from the standard Python library and defining a time duration.
To user guide -The different time concepts supported by pandas are explained in the user guide section on :ref:`time related concepts `. +The various time concepts supported by pandas are explained in the user guide section on :ref:`time related concepts `. .. raw:: html @@ -157,7 +157,7 @@ accessible by the ``dt`` accessor. An overview of the existing date properties is given in the :ref:`time and date components overview table `. More details about the ``dt`` accessor -to return datetime like properties is explained in a dedicated section on the :ref:`dt accessor `. +to return datetime like properties are explained in a dedicated section on the :ref:`dt accessor `. .. raw:: html @@ -335,7 +335,7 @@ When defined, the frequency of the time series is provided by the
  • -Make a plot of the daily median :math:`NO_2` value in each of the stations. +Make a plot of the daily mean :math:`NO_2` value in each of the stations. .. ipython:: python :okwarning: @@ -353,7 +353,7 @@ Make a plot of the daily median :math:`NO_2` value in each of the stations.
    To user guide -More details on the power of time series ``resampling`` is provided in the user gudie section on :ref:`resampling `. +More details on the power of time series ``resampling`` is provided in the user guide section on :ref:`resampling `. .. raw:: html @@ -366,7 +366,7 @@ More details on the power of time series ``resampling`` is provided in the user - Valid date strings can be converted to datetime objects using ``to_datetime`` function or as part of read functions. -- Datetime objects in pandas supports calculations, logical operations +- Datetime objects in pandas support calculations, logical operations and convenient date-related properties using the ``dt`` accessor. - A ``DatetimeIndex`` contains these date-related properties and supports convenient slicing. @@ -382,8 +382,8 @@ More details on the power of time series ``resampling`` is provided in the user
    To user guide -A full overview on time series is given in the pages on :ref:`time series and date functionality `. +A full overview on time series is given on the pages on :ref:`time series and date functionality `. .. raw:: html -
    \ No newline at end of file +
    diff --git a/doc/source/getting_started/intro_tutorials/10_text_data.rst b/doc/source/getting_started/intro_tutorials/10_text_data.rst index 3ff64875d807b..4c03a276090d7 100644 --- a/doc/source/getting_started/intro_tutorials/10_text_data.rst +++ b/doc/source/getting_started/intro_tutorials/10_text_data.rst @@ -188,7 +188,7 @@ Which passenger of the titanic has the longest name? titanic["Name"].str.len() -To get the longest name we first have to get the lenghts of each of the +To get the longest name we first have to get the lengths of each of the names in the ``Name`` column. By using pandas string methods, the :meth:`Series.str.len` function is applied to each of the names individually (element-wise). @@ -238,7 +238,7 @@ a ``dictionary`` to define the mapping ``{from : to}``.
.. warning:: - There is also a :meth:`~Series.str.replace` methods available to replace a + There is also a :meth:`~Series.str.replace` method available to replace a specific set of characters. However, when having a mapping of multiple values, this would become: diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 434d791474807..4c2d0621c6103 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -1,26 +1,14 @@ -.. _tutorials: +.. _communitytutorials: {{ header }} -********* -Tutorials -********* +******************* +Community tutorials +******************* -This is a guide to many pandas tutorials, geared mainly for new users. +This is a guide to many pandas tutorials by the community, geared mainly for new users. -Internal guides -=============== - -pandas' own :ref:`10 Minutes to pandas<10min>`. - -More complex recipes are in the :ref:`Cookbook`. - -A handy pandas `cheat sheet `_. - -Community guides -================ - -pandas Cookbook by Julia Evans +pandas cookbook by Julia Evans ------------------------------ The goal of this 2015 cookbook (by `Julia Evans `_) is to @@ -30,7 +18,7 @@ entails. For the table of contents, see the `pandas-cookbook GitHub repository `_. -Learn Pandas by Hernan Rojas +Learn pandas by Hernan Rojas ---------------------------- A set of lesson for new pandas users: https://bitbucket.org/hrojas/learn-pandas diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 7eb25790f6a7a..4aba8f709fba0 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -119,7 +119,6 @@ programming language. :titlesonly: {% endif %} {% if not single_doc %} - What's New in 1.1.0 getting_started/index user_guide/index {% endif -%} diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index c71350ecd73b3..1725c415fa020 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -3,7 +3,7 @@ .. _api.arrays: ============= -Pandas arrays +pandas arrays ============= .. currentmodule:: pandas diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 78fdfbfd28144..4c0763e091b75 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -37,7 +37,6 @@ objects. api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence api.extensions.ExtensionArray._from_sequence_of_strings - api.extensions.ExtensionArray._ndarray_values api.extensions.ExtensionArray._reduce api.extensions.ExtensionArray._values_for_argsort api.extensions.ExtensionArray._values_for_factorize diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index b326bbb5a465e..cf81540a77d11 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -251,7 +251,7 @@ Combining / joining / merging DataFrame.merge DataFrame.update -Time series-related +Time Series-related ~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 0d9e0b0f4c668..575b82b4b06de 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -108,3 +108,11 @@ Scalar introspection api.types.is_re api.types.is_re_compilable api.types.is_scalar + +Bug report function +------------------- +.. 
autosummary:: + :toctree: api/ + + show_versions + diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index ab6ea5aea6c61..ba12c19763605 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -328,7 +328,7 @@ DatetimeIndex DatetimeIndex -Time/Date components +Time/date components ~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index fc1c6d6bd6d47..17544cb7a1225 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -37,6 +37,7 @@ Methods DateOffset.onOffset DateOffset.is_anchored DateOffset.is_on_offset + DateOffset.__call__ BusinessDay ----------- @@ -69,6 +70,7 @@ Methods BusinessDay.onOffset BusinessDay.is_anchored BusinessDay.is_on_offset + BusinessDay.__call__ BusinessHour ------------ @@ -100,6 +102,7 @@ Methods BusinessHour.onOffset BusinessHour.is_anchored BusinessHour.is_on_offset + BusinessHour.__call__ CustomBusinessDay ----------------- @@ -131,6 +134,7 @@ Methods CustomBusinessDay.onOffset CustomBusinessDay.is_anchored CustomBusinessDay.is_on_offset + CustomBusinessDay.__call__ CustomBusinessHour ------------------ @@ -162,6 +166,7 @@ Methods CustomBusinessHour.onOffset CustomBusinessHour.is_anchored CustomBusinessHour.is_on_offset + CustomBusinessHour.__call__ MonthOffset ----------- @@ -194,6 +199,7 @@ Methods MonthOffset.onOffset MonthOffset.is_anchored MonthOffset.is_on_offset + MonthOffset.__call__ MonthEnd -------- @@ -226,6 +232,7 @@ Methods MonthEnd.onOffset MonthEnd.is_anchored MonthEnd.is_on_offset + MonthEnd.__call__ MonthBegin ---------- @@ -258,6 +265,7 @@ Methods MonthBegin.onOffset MonthBegin.is_anchored MonthBegin.is_on_offset + MonthBegin.__call__ BusinessMonthEnd ---------------- @@ -290,6 +298,7 @@ Methods BusinessMonthEnd.onOffset BusinessMonthEnd.is_anchored BusinessMonthEnd.is_on_offset + BusinessMonthEnd.__call__ BusinessMonthBegin ------------------ @@ -322,6 +331,7 @@ Methods BusinessMonthBegin.onOffset BusinessMonthBegin.is_anchored BusinessMonthBegin.is_on_offset + BusinessMonthBegin.__call__ CustomBusinessMonthEnd ---------------------- @@ -354,6 +364,7 @@ Methods CustomBusinessMonthEnd.onOffset CustomBusinessMonthEnd.is_anchored CustomBusinessMonthEnd.is_on_offset + CustomBusinessMonthEnd.__call__ CustomBusinessMonthBegin ------------------------ @@ -386,6 +397,7 @@ Methods CustomBusinessMonthBegin.onOffset CustomBusinessMonthBegin.is_anchored CustomBusinessMonthBegin.is_on_offset + CustomBusinessMonthBegin.__call__ SemiMonthOffset --------------- @@ -418,6 +430,7 @@ Methods SemiMonthOffset.onOffset SemiMonthOffset.is_anchored SemiMonthOffset.is_on_offset + SemiMonthOffset.__call__ SemiMonthEnd ------------ @@ -450,6 +463,7 @@ Methods SemiMonthEnd.onOffset SemiMonthEnd.is_anchored SemiMonthEnd.is_on_offset + SemiMonthEnd.__call__ SemiMonthBegin -------------- @@ -482,6 +496,7 @@ Methods SemiMonthBegin.onOffset SemiMonthBegin.is_anchored SemiMonthBegin.is_on_offset + SemiMonthBegin.__call__ Week ---- @@ -514,6 +529,7 @@ Methods Week.onOffset Week.is_anchored Week.is_on_offset + Week.__call__ WeekOfMonth ----------- @@ -545,6 +561,7 @@ Methods WeekOfMonth.onOffset WeekOfMonth.is_anchored WeekOfMonth.is_on_offset + WeekOfMonth.__call__ LastWeekOfMonth --------------- @@ -576,6 +593,7 @@ Methods LastWeekOfMonth.onOffset LastWeekOfMonth.is_anchored LastWeekOfMonth.is_on_offset + LastWeekOfMonth.__call__ QuarterOffset ------------- @@ 
-608,6 +626,7 @@ Methods QuarterOffset.onOffset QuarterOffset.is_anchored QuarterOffset.is_on_offset + QuarterOffset.__call__ BQuarterEnd ----------- @@ -640,6 +659,7 @@ Methods BQuarterEnd.onOffset BQuarterEnd.is_anchored BQuarterEnd.is_on_offset + BQuarterEnd.__call__ BQuarterBegin ------------- @@ -672,6 +692,7 @@ Methods BQuarterBegin.onOffset BQuarterBegin.is_anchored BQuarterBegin.is_on_offset + BQuarterBegin.__call__ QuarterEnd ---------- @@ -704,6 +725,7 @@ Methods QuarterEnd.onOffset QuarterEnd.is_anchored QuarterEnd.is_on_offset + QuarterEnd.__call__ QuarterBegin ------------ @@ -736,6 +758,7 @@ Methods QuarterBegin.onOffset QuarterBegin.is_anchored QuarterBegin.is_on_offset + QuarterBegin.__call__ YearOffset ---------- @@ -768,6 +791,7 @@ Methods YearOffset.onOffset YearOffset.is_anchored YearOffset.is_on_offset + YearOffset.__call__ BYearEnd -------- @@ -800,6 +824,7 @@ Methods BYearEnd.onOffset BYearEnd.is_anchored BYearEnd.is_on_offset + BYearEnd.__call__ BYearBegin ---------- @@ -832,6 +857,7 @@ Methods BYearBegin.onOffset BYearBegin.is_anchored BYearBegin.is_on_offset + BYearBegin.__call__ YearEnd ------- @@ -864,6 +890,7 @@ Methods YearEnd.onOffset YearEnd.is_anchored YearEnd.is_on_offset + YearEnd.__call__ YearBegin --------- @@ -896,6 +923,7 @@ Methods YearBegin.onOffset YearBegin.is_anchored YearBegin.is_on_offset + YearBegin.__call__ FY5253 ------ @@ -929,6 +957,7 @@ Methods FY5253.onOffset FY5253.is_anchored FY5253.is_on_offset + FY5253.__call__ FY5253Quarter ------------- @@ -962,6 +991,7 @@ Methods FY5253Quarter.is_anchored FY5253Quarter.is_on_offset FY5253Quarter.year_has_extra_week + FY5253Quarter.__call__ Easter ------ @@ -993,6 +1023,7 @@ Methods Easter.onOffset Easter.is_anchored Easter.is_on_offset + Easter.__call__ Tick ---- @@ -1024,6 +1055,7 @@ Methods Tick.onOffset Tick.is_anchored Tick.is_on_offset + Tick.__call__ Day --- @@ -1055,6 +1087,7 @@ Methods Day.onOffset Day.is_anchored Day.is_on_offset + Day.__call__ Hour ---- @@ -1086,6 +1119,7 @@ Methods Hour.onOffset Hour.is_anchored Hour.is_on_offset + Hour.__call__ Minute ------ @@ -1117,6 +1151,7 @@ Methods Minute.onOffset Minute.is_anchored Minute.is_on_offset + Minute.__call__ Second ------ @@ -1148,6 +1183,7 @@ Methods Second.onOffset Second.is_anchored Second.is_on_offset + Second.__call__ Milli ----- @@ -1179,6 +1215,7 @@ Methods Milli.onOffset Milli.is_anchored Milli.is_on_offset + Milli.__call__ Micro ----- @@ -1210,6 +1247,7 @@ Methods Micro.onOffset Micro.is_anchored Micro.is_on_offset + Micro.__call__ Nano ---- @@ -1241,6 +1279,7 @@ Methods Nano.onOffset Nano.is_anchored Nano.is_on_offset + Nano.__call__ BDay ---- @@ -1277,6 +1316,7 @@ Methods BDay.is_on_offset BDay.rollback BDay.rollforward + BDay.__call__ BMonthEnd --------- @@ -1312,6 +1352,7 @@ Methods BMonthEnd.is_on_offset BMonthEnd.rollback BMonthEnd.rollforward + BMonthEnd.__call__ BMonthBegin ----------- @@ -1347,6 +1388,7 @@ Methods BMonthBegin.is_on_offset BMonthBegin.rollback BMonthBegin.rollforward + BMonthBegin.__call__ CBMonthEnd ---------- @@ -1386,6 +1428,7 @@ Methods CBMonthEnd.is_on_offset CBMonthEnd.rollback CBMonthEnd.rollforward + CBMonthEnd.__call__ CBMonthBegin ------------ @@ -1425,6 +1468,7 @@ Methods CBMonthBegin.is_on_offset CBMonthBegin.rollback CBMonthBegin.rollforward + CBMonthBegin.__call__ CDay ---- @@ -1461,6 +1505,7 @@ Methods CDay.is_on_offset CDay.rollback CDay.rollforward + CDay.__call__ .. 
_api.frequencies: diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 1a69fa076dbf0..ab0540a930396 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -110,7 +110,7 @@ Binary operator functions Series.product Series.dot -Function application, groupby & window +Function application, GroupBy & window -------------------------------------- .. autosummary:: :toctree: api/ @@ -249,7 +249,7 @@ Combining / joining / merging Series.replace Series.update -Time series-related +Time Series-related ------------------- .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 3db1aa12a4275..fb60a0d387ca2 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -75,7 +75,7 @@ Exponentially-weighted moving window functions EWM.corr EWM.cov -Window Indexer +Window indexer -------------- .. currentmodule:: pandas @@ -85,3 +85,4 @@ Base class for defining custom window boundaries. :toctree: api/ api.indexers.BaseIndexer + api.indexers.FixedForwardWindowIndexer diff --git a/doc/source/getting_started/10min.rst b/doc/source/user_guide/10min.rst similarity index 95% rename from doc/source/getting_started/10min.rst rename to doc/source/user_guide/10min.rst index a635b5656bd2d..9994287c827e3 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -39,7 +39,7 @@ and labeled columns: df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) df -Creating a ``DataFrame`` by passing a dict of objects that can be converted to series-like. +Creating a :class:`DataFrame` by passing a dict of objects that can be converted to series-like. .. ipython:: python @@ -51,7 +51,7 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s 'F': 'foo'}) df2 -The columns of the resulting ``DataFrame`` have different +The columns of the resulting :class:`DataFrame` have different :ref:`dtypes `. .. ipython:: python @@ -169,7 +169,7 @@ See the indexing documentation :ref:`Indexing and Selecting Data ` and Getting ~~~~~~~ -Selecting a single column, which yields a ``Series``, +Selecting a single column, which yields a :class:`Series`, equivalent to ``df.A``: .. ipython:: python @@ -469,10 +469,10 @@ Concatenating pandas objects together with :func:`concat`: pd.concat(pieces) .. note:: - Adding a column to a ``DataFrame`` is relatively fast. However, adding + Adding a column to a :class:`DataFrame` is relatively fast. However, adding a row requires a copy, and may be expensive. We recommend passing a - pre-built list of records to the ``DataFrame`` constructor instead - of building a ``DataFrame`` by iteratively appending records to it. + pre-built list of records to the :class:`DataFrame` constructor instead + of building a :class:`DataFrame` by iteratively appending records to it. See :ref:`Appending to dataframe ` for more. Join @@ -520,7 +520,7 @@ See the :ref:`Grouping section `. 'D': np.random.randn(8)}) df -Grouping and then applying the :meth:`~DataFrame.sum` function to the resulting +Grouping and then applying the :meth:`~pandas.core.groupby.GroupBy.sum` function to the resulting groups. .. ipython:: python @@ -528,7 +528,7 @@ groups. df.groupby('A').sum() Grouping by multiple columns forms a hierarchical index, and again we can -apply the ``sum`` function. +apply the :meth:`~pandas.core.groupby.GroupBy.sum` function. .. 
ipython:: python @@ -648,7 +648,7 @@ the quarter end: Categoricals ------------ -pandas can include categorical data in a ``DataFrame``. For full docs, see the +pandas can include categorical data in a :class:`DataFrame`. For full docs, see the :ref:`categorical introduction ` and the :ref:`API documentation `. .. ipython:: python @@ -664,14 +664,13 @@ Convert the raw grades to a categorical data type. df["grade"] Rename the categories to more meaningful names (assigning to -``Series.cat.categories`` is inplace!). +:meth:`Series.cat.categories` is inplace!). .. ipython:: python df["grade"].cat.categories = ["very good", "good", "very bad"] -Reorder the categories and simultaneously add the missing categories (methods under ``Series -.cat`` return a new ``Series`` by default). +Reorder the categories and simultaneously add the missing categories (methods under :meth:`Series.cat` return a new :class:`Series` by default). .. ipython:: python diff --git a/doc/source/getting_started/basics.rst b/doc/source/user_guide/basics.rst similarity index 99% rename from doc/source/getting_started/basics.rst rename to doc/source/user_guide/basics.rst index c6d9a48fcf8ed..055b43bc1e59b 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -7,8 +7,8 @@ ============================== Here we discuss a lot of the essential functionality common to the pandas data -structures. Here's how to create some of the objects used in the examples from -the previous section: +structures. To begin, let's create some example objects like we did in +the :ref:`10 minutes to pandas <10min>` section: .. ipython:: python @@ -1224,8 +1224,6 @@ following can be done: This means that the reindexed Series's index is the same Python object as the DataFrame's index. -.. versionadded:: 0.21.0 - :meth:`DataFrame.reindex` also supports an "axis-style" calling convention, where you specify a single ``labels`` argument and the ``axis`` it applies to. @@ -1435,8 +1433,6 @@ Series can also be used: If the mapping doesn't include a column/index label, it isn't renamed. Note that extra labels in the mapping don't throw an error. -.. versionadded:: 0.21.0 - :meth:`DataFrame.rename` also supports an "axis-style" calling convention, where you specify a single ``mapper`` and the ``axis`` to apply that mapping to. diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index 6370a523b9a0d..d690c1093399a 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -39,7 +39,7 @@ If you would prefer to keep the ``NA`` values you can manually fill them with `` .. _boolean.kleene: -Kleene Logical Operations +Kleene logical operations ------------------------- :class:`arrays.BooleanArray` implements `Kleene Logic`_ (sometimes called three-value logic) for diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index a55326db748fd..7def45ddc13e2 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -211,8 +211,6 @@ To get back to the original ``Series`` or NumPy array, use CategoricalDtype ---------------- -.. versionchanged:: 0.21.0 - A categorical's type is fully described by 1. ``categories``: a sequence of unique values and no missing values @@ -799,7 +797,7 @@ Assigning a ``Categorical`` to parts of a column of other types will use the val .. _categorical.merge: .. 
_categorical.concat: -Merging / Concatenation +Merging / concatenation ~~~~~~~~~~~~~~~~~~~~~~~ By default, combining ``Series`` or ``DataFrames`` which contain the same diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 9951642ca98a4..af2f02a09428b 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -210,7 +210,7 @@ parameter: .. _stats.moments: -Window Functions +Window functions ---------------- .. currentmodule:: pandas.core.window @@ -323,7 +323,7 @@ We provide a number of common statistical functions: .. _stats.rolling_apply: -Rolling Apply +Rolling apply ~~~~~~~~~~~~~ The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs @@ -571,6 +571,20 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other 3 3.0 4 10.0 +.. versionadded:: 1.1 + +For some problems knowledge of the future is available for analysis. For example, this occurs when +each data point is a full time series read from an experiment, and the task is to extract underlying +conditions. In these cases it can be useful to perform forward-looking rolling window computations. +The :func:`FixedForwardWindowIndexer ` class is available for this purpose. +This :func:`BaseIndexer ` subclass implements a closed fixed-width +forward-looking rolling window, and we can use it as follows: + +.. ipython:: python + + from pandas.api.indexers import FixedForwardWindowIndexer + indexer = FixedForwardWindowIndexer(window_size=2) + df.rolling(indexer, min_periods=1).sum() .. _stats.rolling_window.endpoints: diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 4afdb14e5c39e..992cdfa5d7332 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -179,7 +179,7 @@ One could hard code: Selection --------- -DataFrames +Dataframes ********** The :ref:`indexing ` docs. @@ -290,7 +290,7 @@ Notice the same results, with the exception of the index. .. _cookbook.multi_index: -MultiIndexing +Multiindexing ------------- The :ref:`multindexing ` docs. @@ -794,8 +794,7 @@ The :ref:`Resample ` docs. `Time grouping with some missing values `__ -`Valid frequency arguments to Grouper -`__ +Valid frequency arguments to Grouper :ref:`Timeseries ` `Grouping using a MultiIndex `__ @@ -914,7 +913,7 @@ The :ref:`Plotting ` docs. @savefig quartile_boxplot.png df.boxplot(column='price', by='quartiles') -Data In/Out +Data in/out ----------- `Performance comparison of SQL vs HDF5 diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/user_guide/dsintro.rst similarity index 97% rename from doc/source/getting_started/dsintro.rst rename to doc/source/user_guide/dsintro.rst index 200d567a62732..075787d3b9d5b 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -397,6 +397,28 @@ The result will be a DataFrame with the same index as the input Series, and with one column whose name is the original name of the Series (only if no other column name provided). +.. _basics.dataframe.from_list_dataclasses: + +From a list of dataclasses +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.1.0 + +Data Classes as introduced in `PEP557 `__, +can be passed into the DataFrame constructor. +Passing a list of dataclasses is equivalent to passing a list of dictionaries. + +Please be aware that all values in the list should be dataclasses; mixing +types in the list will result in a ``TypeError``. + +..
ipython:: python + + from dataclasses import make_dataclass + + Point = make_dataclass("Point", [("x", int), ("y", int)]) + + pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) + **Missing data** Much more will be said on this topic in the :ref:`Missing data ` diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index f9a72b87e58d8..4c691ebb252e7 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -317,7 +317,7 @@ See `this link ` for more. Piping function calls ~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.21.0 - Similar to the functionality provided by ``DataFrame`` and ``Series``, functions that take ``GroupBy`` objects can be chained together using a ``pipe`` method to allow for a cleaner, more readable syntax. To read about ``.pipe`` in general terms, diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 30b1c0b4eac0d..8226e72779588 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -12,6 +12,8 @@ pandas approaches the problem, with many examples throughout. Users brand-new to pandas should start with :ref:`10min`. +For a high level summary of the pandas fundamentals, see :ref:`dsintro` and :ref:`basics`. + Further information on any specific method can be obtained in the :ref:`api`. @@ -21,6 +23,9 @@ Further information on any specific method can be obtained in the .. toctree:: :maxdepth: 2 + 10min + dsintro + basics io indexing advanced diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 2bd3ff626f2e1..fb815b3a975d1 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -374,7 +374,7 @@ For getting values with a boolean array: df1.loc['a'] > 0 df1.loc[:, df1.loc['a'] > 0] -NA values in a boolean array propogate as ``False``: +NA values in a boolean array propagate as ``False``: .. versionchanged:: 1.0.2 diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c34247a49335d..df6b44ac654ce 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -28,7 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` - binary;`ORC Format `__;:ref:`read_orc`; + binary;`ORC Format `__;:ref:`read_orc`; binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; @@ -109,6 +109,11 @@ index_col : int, str, sequence of int / str, or False, default ``None`` Note: ``index_col=False`` can be used to force pandas to *not* use the first column as the index, e.g. when you have a malformed file with delimiters at the end of each line. + + The default value of ``None`` instructs pandas to guess. If the number of + fields in the column header row is equal to the number of fields in the body + of the data file, then a default index is used. If it is one larger, then + the first field is used as an index. usecols : list-like or callable, default ``None`` Return a subset of the columns. If list-like, all elements must either be positional (i.e. 
integer indices into the document columns) or strings @@ -280,14 +285,18 @@ chunksize : int, default ``None`` Quoting, compression, and file format +++++++++++++++++++++++++++++++++++++ -compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'`` +compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'`` For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. - Set to ``None`` for no decompression. + Set to ``None`` for no decompression. Can also be a dict with key ``'method'`` + set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to + compression settings. As an example, the following could be passed for + faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``. .. versionchanged:: 0.24.0 'infer' option added and set to default. + .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` @@ -461,8 +470,6 @@ specification: pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes -.. versionadded:: 0.21.0 - Specifying ``dtype='category'`` will result in an unordered ``Categorical`` whose ``categories`` are the unique values observed in the data. For more control on the categories and order, create a @@ -2171,8 +2178,6 @@ Line delimited json pandas is able to read and write line-delimited json files that are common in data processing pipelines using Hadoop or Spark. -.. versionadded:: 0.21.0 - For line-delimited json files, pandas can also return an iterator which reads in ``chunksize`` lines at a time. This can be useful for large files or to read from a stream. .. ipython:: python @@ -3346,6 +3351,12 @@ The compression type can be an explicit parameter or be inferred from the file e If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or ``'.xz'``, respectively. +The compression parameter can also be a ``dict`` in order to pass options to the +compression protocol. It must have a ``'method'`` key set to the name +of the compression protocol, which must be one of +{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to +the underlying compression library. + .. ipython:: python df = pd.DataFrame({ @@ -3382,6 +3393,15 @@ The default is to 'infer': rt = pd.read_pickle("s1.pkl.bz2") rt +Passing options to the compression protocol in order to speed up compression: + +.. ipython:: python + + df.to_pickle( + "data.pkl.gz", + compression={"method": "gzip", 'compresslevel': 1} + ) + .. ipython:: python :suppress: @@ -4582,17 +4602,15 @@ frames efficient, and to make sharing data across data analysis languages easy. Feather is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas dtypes, including extension dtypes such as categorical and datetime with tz. -Several caveats. +Several caveats: -* This is a newer library, and the format, though stable, is not guaranteed to be backward compatible - to the earlier versions. * The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an error if a non-default one is provided. 
You can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to ignore it. * Duplicate column names and non-string columns names are not supported -* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message - on an attempt at serialization. +* Actual Python objects in object dtype columns are not supported. These will + raise a helpful error message on an attempt at serialization. See the `Full Documentation `__. @@ -4646,8 +4664,6 @@ Read from a feather file. Parquet ------- -.. versionadded:: 0.21.0 - `Apache Parquet `__ provides a partitioned binary columnar serialization for data frames. It is designed to make reading and writing data frames efficient, and to make sharing data across data analysis languages easy. Parquet can use a variety of compression techniques to shrink the file size as much as possible @@ -4817,7 +4833,7 @@ ORC .. versionadded:: 1.0.0 -Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization +Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow `__ library. @@ -5005,7 +5021,7 @@ Possible values are: This usually provides better performance for analytic databases like *Presto* and *Redshift*, but has worse performance for traditional SQL backend if the table contains many columns. - For more information check the SQLAlchemy `documention + For more information check the SQLAlchemy `documentation `__. - callable with signature ``(pd_table, conn, keys, data_iter)``: This can be used to implement a more performant insertion method based on diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 8fdcd8d281a41..0450c81958a51 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -573,8 +573,6 @@ all standard database join operations between ``DataFrame`` or named ``Series`` dataset. * "many_to_many" or "m:m": allowed, but does not result in checks. - .. versionadded:: 0.21.0 - .. note:: Support for specifying index levels as the ``on``, ``left_on``, and @@ -724,6 +722,27 @@ either the left or right tables, the values in the joined table will be labels=['left', 'right'], vertical=False); plt.close('all'); +You can merge a multi-indexed Series and a DataFrame if the names of +the MultiIndex correspond to the columns from the DataFrame. Transform +the Series to a DataFrame using :meth:`Series.reset_index` before merging, +as shown in the following example. + +.. ipython:: python + + df = pd.DataFrame({"Let": ["A", "B", "C"], "Num": [1, 2, 3]}) + df + + ser = pd.Series( + ["a", "b", "c", "d", "e", "f"], + index=pd.MultiIndex.from_arrays( + [["A", "B", "C"] * 2, [1, 2, 3, 4, 5, 6]], names=["Let", "Num"] + ), + ) + ser + + pd.merge(df, ser.reset_index(), on=['Let', 'Num']) + + Here is another example with duplicate join keys in DataFrames: .. ipython:: python @@ -752,8 +771,6 @@ Here is another example with duplicate join keys in DataFrames: Checking for duplicate keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.21.0 - Users can use the ``validate`` argument to automatically check whether there are unexpected duplicates in their merge keys. Key uniqueness is checked before merge operations and so should protect against memory overflows.
Checking key diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 5817efb31814e..398336960e769 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -140,7 +140,7 @@ More information can be found in the `ipython documentation .. _options.frequently_used: -Frequently Used Options +Frequently used options ----------------------- The following is a walk-through of the more frequently used display options. diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 58733b852e3a1..7e890962d8da1 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -272,7 +272,7 @@ the right thing: .. _reshaping.melt: -Reshaping by Melt +Reshaping by melt ----------------- .. image:: ../_static/reshaping_melt.png diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 43bb4966ec5bf..cddc3cb2600fd 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -246,6 +246,7 @@ We'll import ``dask.dataframe`` and notice that the API feels similar to pandas. We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in. .. ipython:: python + :okwarning: import dask.dataframe as dd @@ -258,7 +259,7 @@ Inspecting the ``ddf`` object, we see a few things * There are familiar methods like ``.groupby``, ``.sum``, etc. * There are new attributes like ``.npartitions`` and ``.divisions`` -The partitions and divisions are how Dask parallizes computation. A **Dask** +The partitions and divisions are how Dask parallelizes computation. A **Dask** DataFrame is made up of many **Pandas** DataFrames. A single method call on a Dask DataFrame ends up making many pandas method calls, and Dask knows how to coordinate everything to get the result. diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 1f2f8818c8458..fd8dda4fe365e 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -620,8 +620,8 @@ "aligns = ['left','zero','mid']\n", "for align in aligns:\n", " row = \"{}\".format(align)\n", - " for serie in [test1,test2,test3]:\n", - " s = serie.copy()\n", + " for series in [test1,test2,test3]:\n", + " s = series.copy()\n", " s.name=''\n", " row += \"{}\".format(s.to_frame().style.bar(align=align, \n", " color=['#d65f5f', '#5fba7d'], \n", diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 2e4d0fecaf5cf..bea0f42f6849c 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -8,7 +8,7 @@ Working with text data .. _text.types: -Text Data Types +Text data types --------------- .. versionadded:: 1.0.0 @@ -113,7 +113,7 @@ Everything else that follows in the rest of this document applies equally to .. _text.string_methods: -String Methods +String methods -------------- Series and Index are equipped with a set of string processing methods @@ -633,7 +633,7 @@ same result as a ``Series.str.extractall`` with a default index (starts from 0). pd.Series(["a1a2", "b1", "c1"], dtype="string").str.extractall(two_groups) -Testing for Strings that match or contain a pattern +Testing for strings that match or contain a pattern --------------------------------------------------- You can check whether elements contain a pattern: @@ -641,21 +641,40 @@ You can check whether elements contain a pattern: .. 
ipython:: python pattern = r'[0-9][a-z]' - pd.Series(['1', '2', '3a', '3b', '03c'], + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.contains(pattern) Or whether elements match a pattern: .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c'], + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], dtype="string").str.match(pattern) -The distinction between ``match`` and ``contains`` is strictness: ``match`` -relies on strict ``re.match``, while ``contains`` relies on ``re.search``. +.. versionadded:: 1.1.0 -Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take -an extra ``na`` argument so missing values can be considered True or False: +.. ipython:: python + + pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], + dtype="string").str.fullmatch(pattern) + +.. note:: + + The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness: + ``fullmatch`` tests whether the entire string matches the regular expression; + ``match`` tests whether there is a match of the regular expression that begins + at the first character of the string; and ``contains`` tests whether there is + a match of the regular expression at any position within the string. + + The corresponding functions in the ``re`` package for these three match modes are + `re.fullmatch `_, + `re.match `_, and + `re.search `_, + respectively. + +Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and +``endswith`` take an extra ``na`` argument so missing values can be considered +True or False: .. ipython:: python diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index f208c8d576131..a09a5576ca378 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -122,7 +122,7 @@ as ``np.nan`` does for float data. .. _timeseries.representation: -Timestamps vs. Time Spans +Timestamps vs. time spans ------------------------- Timestamped data is the most basic type of time series data that associates @@ -772,6 +772,7 @@ There are several time/date properties that one can access from ``Timestamp`` or week,"The week ordinal of the year" dayofweek,"The number of the day of the week with Monday=0, Sunday=6" weekday,"The number of the day of the week with Monday=0, Sunday=6" + isocalendar,"The ISO 8601 year, week and day of the date" quarter,"Quarter of the date: Jan-Mar = 1, Apr-Jun = 2, etc." days_in_month,"The number of days in the month of the datetime" is_month_start,"Logical indicating if first day of month (defined by frequency)" @@ -786,6 +787,15 @@ Furthermore, if you have a ``Series`` with datetimelike values, then you can access these properties via the ``.dt`` accessor, as detailed in the section on :ref:`.dt accessors`. +.. versionadded:: 1.1.0 + +You may obtain the year, week and day components of the ISO year from the ISO 8601 standard: + +.. ipython:: python + + idx = pd.date_range(start='2019-12-29', freq='D', periods=4) + idx.to_series().dt.isocalendar + .. _timeseries.offsets: DateOffset objects @@ -1434,7 +1444,7 @@ or calendars with additional rules. .. _timeseries.advanced_datetime: -Time Series-Related Instance Methods +Time series-related instance methods ------------------------------------ Shifting / lagging diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 756dd06aced7f..451ddf046416e 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -796,7 +796,7 @@ before plotting. .. 
_visualization.tools: -Plotting Tools +Plotting tools -------------- These functions can be imported from ``pandas.plotting`` @@ -1045,7 +1045,7 @@ for more information. .. _visualization.formatting: -Plot Formatting +Plot formatting --------------- Setting the plot style diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 68aabfe76d8de..b5ac96752536e 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -3,11 +3,11 @@ {{ header }} ************* -Release Notes +Release notes ************* This is the list of changes to pandas between each release. For full details, -see the commit logs at https://github.com/pandas-dev/pandas. For install and +see the `commit logs `_. For install and upgrade instructions, see :ref:`install`. Version 1.1 @@ -24,9 +24,10 @@ Version 1.0 .. toctree:: :maxdepth: 2 - v1.0.0 - v1.0.1 + v1.0.3 v1.0.2 + v1.0.1 + v1.0.0 Version 0.25 ------------ diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 2e0442364b2f3..443250592a4a7 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0100: -v0.10.0 (December 17, 2012) ---------------------------- +Version 0.10.0 (December 17, 2012) +---------------------------------- {{ header }} @@ -490,7 +490,7 @@ Updated PyTables support however, query terms using the prior (undocumented) methodology are unsupported. You must read in the entire file and write it out using the new format to take advantage of the updates. -N dimensional Panels (experimental) +N dimensional panels (experimental) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Adding experimental support for Panel4D and factory functions to create n-dimensional named panels. diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index c4251f70d85b6..1e9eafd2700e9 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0101: -v0.10.1 (January 22, 2013) ---------------------------- +Version 0.10.1 (January 22, 2013) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index 148ee349b049c..6c13a125a4e54 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0110: -v0.11.0 (April 22, 2013) ------------------------- +Version 0.11.0 (April 22, 2013) +------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 823e177f3e05e..9e864f63c43e0 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0120: -v0.12.0 (July 24, 2013) ------------------------- +Version 0.12.0 (July 24, 2013) +------------------------------ {{ header }} @@ -177,8 +177,8 @@ API changes ``__repr__``). Plus string safety throughout. Now employed in many places throughout the pandas library. (:issue:`4090`, :issue:`4092`) -I/O enhancements -~~~~~~~~~~~~~~~~ +IO enhancements +~~~~~~~~~~~~~~~ - ``pd.read_html()`` can now parse HTML strings, files or urls and return DataFrames, courtesy of @cpcloud. (:issue:`3477`, :issue:`3605`, :issue:`3606`, :issue:`3616`). diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index de5e1986744fe..5a904d6c85c61 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -1,7 +1,7 @@ .. 
_whatsnew_0130: -v0.13.0 (January 3, 2014) ---------------------------- +Version 0.13.0 (January 3, 2014) +-------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 4f9ab761334e7..6fe010be8fb2d 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0131: -v0.13.1 (February 3, 2014) --------------------------- +Version 0.13.1 (February 3, 2014) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 25a75492d78fb..0041f6f03afef 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -473,7 +473,7 @@ Some other enhancements to the sql functions include: .. _whatsnew_0140.slicers: -MultiIndexing using slicers +Multiindexing using slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In 0.14.0 we added a new way to slice MultiIndexed objects. @@ -904,7 +904,7 @@ There are no experimental changes in 0.14.0 .. _whatsnew_0140.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in Series ValueError when index doesn't match data (:issue:`6532`) diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 95e354e425143..fc190908bdc07 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -600,7 +600,7 @@ Rolling/expanding moments improvements .. _whatsnew_0150.sql: -Improvements in the sql io module +Improvements in the SQL io module ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added support for a ``chunksize`` parameter to ``to_sql`` function. This allows DataFrame to be written in chunks and avoid packet-size overflow errors (:issue:`8062`). diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index d3f96d4185d65..e371f1d9fe69a 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -1197,7 +1197,7 @@ Performance improvements .. _whatsnew_0180.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in ``GroupBy.size`` when data-frame is empty. (:issue:`11699`) diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index f786ce513f6fe..2c6e8f0e27154 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -380,7 +380,7 @@ New behavior: .. _whatsnew_0181.numpy_compatibility: -numpy function compatibility +NumPy function compatibility ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Compatibility between pandas array-like methods (e.g. ``sum`` and ``take``) and their ``numpy`` diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 6eb509a258430..7390b80217b2c 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -377,7 +377,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci .. _whatsnew_0190.gbq: -Google BigQuery Enhancements +Google BigQuery enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The :func:`read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the `docs `__ for more details (:issue:`13615`). @@ -385,7 +385,7 @@ Google BigQuery Enhancements .. _whatsnew_0190.errstate: -Fine-grained numpy errstate +Fine-grained NumPy errstate ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previous versions of pandas would permanently silence numpy's ufunc error handling when ``pandas`` was imported. 
Pandas did this in order to silence the warnings that would arise from using numpy ufuncs on missing data, which are usually represented as ``NaN`` s. Unfortunately, this silenced legitimate warnings arising in non-pandas code in the application. Starting with 0.19.0, pandas will use the ``numpy.errstate`` context manager to silence these warnings in a more fine-grained manner, only around where these operations are actually used in the pandas code base. (:issue:`13109`, :issue:`13145`) @@ -1185,7 +1185,7 @@ the result of calling :func:`read_csv` without the ``chunksize=`` argument .. _whatsnew_0190.sparse: -Sparse Changes +Sparse changes ^^^^^^^^^^^^^^ These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index ceb1c7f27231b..06bbd9679bb4d 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -356,7 +356,7 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you .. _whatsnew_0200.enhancements.style_excel: -Excel output for styled DataFrames +Excel output for styled dataframes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Experimental support has been added to export ``DataFrame.style`` formats to Excel using the ``openpyxl`` engine. (:issue:`15530`) @@ -813,7 +813,7 @@ New behavior: .. _whatsnew_0200.api_breaking.gbq: -Pandas Google BigQuery support has moved +pandas Google BigQuery support has moved ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ pandas has split off Google BigQuery support into a separate package ``pandas-gbq``. You can ``conda install pandas-gbq -c conda-forge`` or @@ -1289,7 +1289,7 @@ A new public ``pandas.plotting`` module has been added that holds plotting funct .. _whatsnew_0200.privacy.development: -Other Development Changes +Other development changes ^^^^^^^^^^^^^^^^^^^^^^^^^ - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 85de0150a5a28..45399792baecf 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -397,7 +397,7 @@ Other enhancements - :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). - :meth:`DataFrame.to_sql` now supports writing ``TIMESTAMP WITH TIME ZONE`` types for supported databases. For databases that don't support timezones, datetime data will be stored as timezone unaware local timestamps. See the :ref:`io.sql_datetime_data` for implications (:issue:`9086`). -- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) +- :func:`to_timedelta` now supports iso-formatted timedelta strings (:issue:`21877`) - :class:`Series` and :class:`DataFrame` now support :class:`Iterable` objects in the constructor (:issue:`2193`) - :class:`DatetimeIndex` has gained the :attr:`DatetimeIndex.timetz` attribute. This returns the local time with timezone information. (:issue:`21358`) - :meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, and :meth:`~Timestamp.floor` for :class:`DatetimeIndex` and :class:`Timestamp` @@ -733,7 +733,7 @@ is the case with :attr:`Period.end_time`, for example .. 
_whatsnew_0240.api_breaking.datetime_unique: -Series.unique for Timezone-Aware Data +Series.unique for timezone-aware data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The return type of :meth:`Series.unique` for datetime with timezone values has changed @@ -1131,7 +1131,7 @@ data is incompatible with a passed ``dtype=`` (:issue:`15832`) .. _whatsnew_0240.api.concat_categorical: -Concatenation Changes +Concatenation changes ^^^^^^^^^^^^^^^^^^^^^ Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values now diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b18d022349001..44558fd63ba15 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -85,7 +85,7 @@ See :ref:`groupby.aggregate.named` for more. .. _whatsnew_0250.enhancements.multiple_lambdas: -Groupby Aggregation with multiple lambdas +Groupby aggregation with multiple lambdas ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can now provide multiple lambda functions to a list-like aggregation in @@ -1243,7 +1243,7 @@ Sparse - Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned (:issue:`26946`). -Build Changes +Build changes ^^^^^^^^^^^^^ - Fix install error with PyPy on macOS (:issue:`26536`) diff --git a/doc/source/whatsnew/v0.4.x.rst b/doc/source/whatsnew/v0.4.x.rst index 8e41e528f5b75..0ed7bb396674e 100644 --- a/doc/source/whatsnew/v0.4.x.rst +++ b/doc/source/whatsnew/v0.4.x.rst @@ -1,7 +1,7 @@ .. _whatsnew_04x: -v.0.4.1 through v0.4.3 (September 25 - October 9, 2011) -------------------------------------------------------- +Versions 0.4.1 through 0.4.3 (September 25 - October 9, 2011) +------------------------------------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.5.0.rst b/doc/source/whatsnew/v0.5.0.rst index 37c52ac7bb34e..7ccb141260f18 100644 --- a/doc/source/whatsnew/v0.5.0.rst +++ b/doc/source/whatsnew/v0.5.0.rst @@ -1,8 +1,8 @@ .. _whatsnew_050: -v.0.5.0 (October 24, 2011) --------------------------- +Version 0.5.0 (October 24, 2011) +-------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index 973ba897b3234..f984b9ad71b63 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_060: -v.0.6.0 (November 25, 2011) ---------------------------- +Version 0.6.0 (November 25, 2011) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.6.1.rst b/doc/source/whatsnew/v0.6.1.rst index d01757775d694..8eea0a07f1f79 100644 --- a/doc/source/whatsnew/v0.6.1.rst +++ b/doc/source/whatsnew/v0.6.1.rst @@ -1,8 +1,8 @@ .. _whatsnew_061: -v.0.6.1 (December 13, 2011) ---------------------------- +Version 0.6.1 (December 13, 2011) +--------------------------------- New features ~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.7.0.rst b/doc/source/whatsnew/v0.7.0.rst index a63cd37e47dc2..a193b8049e951 100644 --- a/doc/source/whatsnew/v0.7.0.rst +++ b/doc/source/whatsnew/v0.7.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0700: -v.0.7.0 (February 9, 2012) --------------------------- +Version 0.7.0 (February 9, 2012) +-------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.7.1.rst b/doc/source/whatsnew/v0.7.1.rst index 04b548a93c338..7082ef8ed2882 100644 --- a/doc/source/whatsnew/v0.7.1.rst +++ b/doc/source/whatsnew/v0.7.1.rst @@ -1,7 +1,7 @@ .. 
_whatsnew_0701: -v.0.7.1 (February 29, 2012) ---------------------------- +Version 0.7.1 (February 29, 2012) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.7.2.rst b/doc/source/whatsnew/v0.7.2.rst index ad72b081e590c..e10a7b499549b 100644 --- a/doc/source/whatsnew/v0.7.2.rst +++ b/doc/source/whatsnew/v0.7.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_0702: -v.0.7.2 (March 16, 2012) ---------------------------- +Version 0.7.2 (March 16, 2012) +------------------------------ {{ header }} diff --git a/doc/source/whatsnew/v0.7.3.rst b/doc/source/whatsnew/v0.7.3.rst index 020cf3bdc2d59..5ed48c0d8d6d9 100644 --- a/doc/source/whatsnew/v0.7.3.rst +++ b/doc/source/whatsnew/v0.7.3.rst @@ -1,7 +1,7 @@ .. _whatsnew_0703: -v.0.7.3 (April 12, 2012) ------------------------- +Version 0.7.3 (April 12, 2012) +------------------------------ {{ header }} @@ -44,7 +44,7 @@ New features - Add ``kurt`` methods to Series and DataFrame for computing kurtosis -NA Boolean comparison API change +NA boolean comparison API change ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Reverted some changes to how NA values (represented typically as ``NaN`` or diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 072d1bae2a2b9..2a49315cc3b12 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_080: -v0.8.0 (June 29, 2012) ------------------------- +Version 0.8.0 (June 29, 2012) +----------------------------- {{ header }} @@ -42,7 +42,7 @@ Bug fixes to the 0.7.x series for legacy NumPy < 1.6 users will be provided as they arise. There will be no more further development in 0.7.x beyond bug fixes. -Time series changes and improvements +Time Series changes and improvements ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: diff --git a/doc/source/whatsnew/v0.8.1.rst b/doc/source/whatsnew/v0.8.1.rst index 1e6b9746c85a5..a00a57a0a1cdb 100644 --- a/doc/source/whatsnew/v0.8.1.rst +++ b/doc/source/whatsnew/v0.8.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0801: -v0.8.1 (July 22, 2012) ----------------------- +Version 0.8.1 (July 22, 2012) +----------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.9.0.rst b/doc/source/whatsnew/v0.9.0.rst index 3d9ff3c7a89fd..565b965c116db 100644 --- a/doc/source/whatsnew/v0.9.0.rst +++ b/doc/source/whatsnew/v0.9.0.rst @@ -3,8 +3,8 @@ {{ header }} -v0.9.0 (October 7, 2012) ------------------------- +Version 0.9.0 (October 7, 2012) +------------------------------- This is a major release from 0.8.1 and includes several new features and enhancements along with a large number of bug fixes. New features include diff --git a/doc/source/whatsnew/v0.9.1.rst b/doc/source/whatsnew/v0.9.1.rst index b8932ae2ae522..3b2924d175cdf 100644 --- a/doc/source/whatsnew/v0.9.1.rst +++ b/doc/source/whatsnew/v0.9.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0901: -v0.9.1 (November 14, 2012) --------------------------- +Version 0.9.1 (November 14, 2012) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6597b764581a4..4f0ca97310d85 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -15,7 +15,7 @@ including other versions of pandas. 1.0. -New Deprecation Policy +New deprecation policy ~~~~~~~~~~~~~~~~~~~~~~ Starting with Pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to @@ -61,7 +61,7 @@ the :ref:`custom window rolling documentation ` .. 
_whatsnew_100.to_markdown: -Converting to Markdown +Converting to markdown ^^^^^^^^^^^^^^^^^^^^^^ We've added :meth:`~DataFrame.to_markdown` for creating a markdown table (:issue:`11052`) @@ -746,7 +746,7 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. -Build Changes +Build changes ^^^^^^^^^^^^^ Pandas has added a `pyproject.toml `_ file and will no longer include @@ -778,7 +778,7 @@ Other API changes .. _whatsnew_100.api.documentation: -Documentation Improvements +Documentation improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added new section on :ref:`scale` (:issue:`28315`). diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst index ef3bb8161d13f..c42aab6de4cc3 100644 --- a/doc/source/whatsnew/v1.0.1.rst +++ b/doc/source/whatsnew/v1.0.1.rst @@ -16,7 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :class:`DataFrame` setting values with a slice (e.g. ``df[-4:] = 1``) indexing by label instead of position (:issue:`31469`) -- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containg a :class:`datetime.date` (:issue:`31501`) +- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containing a :class:`datetime.date` (:issue:`31501`) - Fixed regression in ``DataFrame.__setitem__`` raising an ``AttributeError`` with a :class:`MultiIndex` and a non-monotonic indexer (:issue:`31449`) - Fixed regression in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31457`) - Fixed regression in ``.groupby().agg()`` raising an ``AssertionError`` for some reductions like ``min`` on object-dtype columns (:issue:`31522`) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 1b6098e6b6ac1..c3f144e2f0cb3 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_102: -What's new in 1.0.2 (February ??, 2020) ---------------------------------------- +What's new in 1.0.2 (March 12, 2020) +------------------------------------ These are the changes in pandas 1.0.2. See :ref:`release` for a full changelog including other versions of pandas. @@ -15,16 +15,35 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fixed regression in :meth:`DataFrame.to_excel` when ``columns`` kwarg is passed (:issue:`31677`) -- Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) -- Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) -- Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) -- Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) +**Groupby** + +- Fixed regression in :meth:`groupby(..).agg() ` which was failing on frames with :class:`MultiIndex` columns and a custom function (:issue:`31777`) +- Fixed regression in ``groupby(..).rolling(..).apply()`` (``RollingGroupby``) where the ``raw`` parameter was ignored (:issue:`31754`) +- Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) +- Fixed regression in :meth:`groupby(..).nunique() ` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) +- Fixed regression in ``DataFrame.groupby`` raising a ``ValueError`` from an internal operation (:issue:`31802`) +- Fixed regression in :meth:`groupby(..).agg() ` calling a user-provided function an extra time on an empty input (:issue:`31760`) + +**I/O** + +- Fixed regression in :meth:`read_csv` in which the ``encoding`` option was not recognized with certain file-like objects (:issue:`31819`) +- Fixed regression in :meth:`DataFrame.to_excel` when the ``columns`` keyword argument is passed (:issue:`31677`) +- Fixed regression in :class:`ExcelFile` where the stream passed into the function was closed by the destructor. (:issue:`31467`) - Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`). + +**Reindexing/alignment** + +- Fixed regression in :meth:`Series.align` when ``other`` is a :class:`DataFrame` and ``method`` is not ``None`` (:issue:`31785`) +- Fixed regression in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with (tz-aware) index and ``method=nearest`` (:issue:`26683`) +- Fixed regression in :meth:`DataFrame.reindex_like` on a :class:`DataFrame` subclass raised an ``AssertionError`` (:issue:`31925`) - Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`) -- Fixed regression in :meth:`GroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`) -- Joining on :class:`DatetimeIndex` or :class:`TimedeltaIndex` will preserve ``freq`` in simple cases (:issue:`32166`) -- + +**Other** + +- Fixed regression in joining on :class:`DatetimeIndex` or :class:`TimedeltaIndex` to preserve ``freq`` in simple cases (:issue:`32166`) +- Fixed regression in :meth:`Series.shift` with ``datetime64`` dtype when passing an integer ``fill_value`` (:issue:`32591`) +- Fixed regression in the repr of an object-dtype :class:`Index` with bools and missing values (:issue:`32146`) + .. 
--------------------------------------------------------------------------- @@ -62,8 +81,9 @@ Bug fixes **Datetimelike** -- Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with a tz-aware index (:issue:`26683`) +- Bug in :meth:`Series.astype` not copying for tz-naive and tz-aware ``datetime64`` dtype (:issue:`32490`) - Bug where :func:`to_datetime` would raise when passed ``pd.NA`` (:issue:`32213`) +- Improved error message when subtracting two :class:`Timestamp` that result in an out-of-bounds :class:`Timedelta` (:issue:`31774`) **Categorical** @@ -74,14 +94,26 @@ Bug fixes **I/O** - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`) +- Bug in :meth:`pandas.json_normalize` when value in meta path is not iterable (:issue:`31507`) +- Fixed pickling of ``pandas.NA``. Previously a new object was returned, which broke computations relying on ``NA`` being a singleton (:issue:`31847`) - Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`). **Experimental dtypes** -- Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`). +- Fixed bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`). +- Fixed bug in :meth:`DataFrame.convert_dtypes` for series with mix of integers and strings (:issue:`32117`) +- Fixed bug in :meth:`DataFrame.convert_dtypes` where ``BooleanDtype`` columns were converted to ``Int64`` (:issue:`32287`) - Fixed bug in setting values using a slice indexer with string dtype (:issue:`31772`) -- Fixed bug where :meth:`GroupBy.first` and :meth:`GroupBy.last` would raise a ``TypeError`` when groups contained ``pd.NA`` in a column of object dtype (:issue:`32123`) -- Fix bug in :meth:`Series.convert_dtypes` for series with mix of integers and strings (:issue:`32117`) +- Fixed bug where :meth:`pandas.core.groupby.GroupBy.first` and :meth:`pandas.core.groupby.GroupBy.last` would raise a ``TypeError`` when groups contained ``pd.NA`` in a column of object dtype (:issue:`32123`) +- Fixed bug where :meth:`DataFrameGroupBy.mean`, :meth:`DataFrameGroupBy.median`, :meth:`DataFrameGroupBy.var`, and :meth:`DataFrameGroupBy.std` would raise a ``TypeError`` on ``Int64`` dtype columns (:issue:`32219`) + +**Strings** + +- Using ``pd.NA`` with :meth:`Series.str.repeat` now correctly outputs a null value instead of raising error for vector inputs (:issue:`31632`) + +**Rolling** + +- Fixed rolling operations with variable window (defined by time duration) on decreasing time index (:issue:`32385`). .. --------------------------------------------------------------------------- @@ -90,4 +122,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v1.0.1..v1.0.2|HEAD +.. contributors:: v1.0.1..v1.0.2 diff --git a/doc/source/whatsnew/v1.0.3.rst b/doc/source/whatsnew/v1.0.3.rst new file mode 100644 index 0000000000000..26d06433bda0c --- /dev/null +++ b/doc/source/whatsnew/v1.0.3.rst @@ -0,0 +1,29 @@ + +.. _whatsnew_103: + +What's new in 1.0.3 (March 17, 2020) +------------------------------------ + +These are the changes in pandas 1.0.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_103.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed regression in ``resample.agg`` when the underlying data is non-writeable (:issue:`31710`) +- Fixed regression in :class:`DataFrame` exponentiation with reindexing (:issue:`32685`) + +.. _whatsnew_103.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.0.2..v1.0.3|HEAD diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2b64b85863def..5fe7d12188860 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -58,18 +58,67 @@ For example: For more on working with fold, see :ref:`Fold subsection ` in the user guide. +.. _whatsnew_110.to_datetime_multiple_tzname_tzoffset_support: + +Parsing timezone-aware format with different timezones in to_datetime +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`to_datetime` now supports parsing formats containing timezone names (``%Z``) and UTC offsets (``%z``) from different timezones, and then converting them to UTC by setting ``utc=True``. This returns a :class:`DatetimeIndex` with its timezone at UTC, as opposed to an :class:`Index` with ``object`` dtype when ``utc=True`` is not set (:issue:`32792`). + +For example: + +.. ipython:: python + + tz_strs = ["2010-01-01 12:00:00 +0100", "2010-01-01 12:00:00 -0100", + "2010-01-01 12:00:00 +0300", "2010-01-01 12:00:00 +0400"] + pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True) + pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z') + .. _whatsnew_110.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) +- :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`) -- +- :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) +- Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) +- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). +- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) +- :meth:`MultiIndex.union` will now raise a `RuntimeWarning` if the objects inside are unsortable; pass `sort=False` to suppress this warning (:issue:`33015`) +- :class:`Series.dt` and :class:`DatetimeIndex` now have an `isocalendar` accessor that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`). +- The :meth:`DataFrame.to_feather` method now supports additional keyword + arguments (e.g. to set the compression) that were added in pyarrow 0.17 + (:issue:`33422`). +- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`, + and :meth:`DataFrame.to_json` now support passing a dict of + compression arguments when using the ``gzip`` and ``bz2`` protocols. + This can be used to set a custom compression level, e.g., + ``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})`` + (:issue:`33196`); a brief sketch follows below.
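As a minimal, hedged sketch of the ``compression`` dict from the last entry above (the frame contents and the file name ``example.csv.gz`` are purely illustrative; the assumption is the 1.1.0 behaviour described in that entry, with every key other than ``'method'`` forwarded to the underlying compression library):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"a": range(100_000)})

   # Write a gzip-compressed CSV with a fast (low) compression level;
   # keys other than "method" are passed on to the gzip library.
   df.to_csv("example.csv.gz", compression={"method": "gzip", "compresslevel": 1})

   # Reading back: the default compression="infer" picks gzip from the ".gz" suffix.
   round_tripped = pd.read_csv("example.csv.gz", index_col=0)

A lower ``compresslevel`` trades file size for speed, which is often a reasonable choice for intermediate files.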
--------------------------------------------------------------------------- +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some minimum supported versions of dependencies were updated (:issue:`29766`, :issue:`29723`). +If installed, we now require: + ++-----------------+-----------------+----------+---------+ +| Package | Minimum Version | Required | Changed | ++=================+=================+==========+=========+ +| python-dateutil | 2.7.3 | X | | ++-----------------+-----------------+----------+---------+ + + +Development Changes +^^^^^^^^^^^^^^^^^^^ + +- The minimum version of Cython is now the most recent bug-fix version (0.29.16) (:issue:`33334`). + .. _whatsnew_110.api.other: Other API changes @@ -80,28 +129,236 @@ Other API changes - Added :meth:`DataFrame.value_counts` (:issue:`5377`) - :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) +- Using a :func:`pandas.api.indexers.BaseIndexer` with ``std``, ``var``, ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`) +- Using a :func:`pandas.api.indexers.BaseIndexer` with ``min``, ``max`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) +- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. - Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. Previously a ``AttributeError`` was raised (:issue:`31126`) -- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std`` and :meth:`~DataFrameGroupby.var``) +- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) now raise a ``TypeError`` if a not-accepted keyword argument is passed into it. - Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median``) (:issue:`31485`) + Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) +- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) + +``MultiIndex.get_indexer`` interprets `method` argument differently +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This restores the behavior of :meth:`MultiIndex.get_indexer` with ``method='backfill'`` or ``method='pad'`` to the behavior before pandas 0.23.0. In particular, MultiIndexes are treated as a list of tuples and padding or backfilling is done with respect to the ordering of these lists of tuples (:issue:`29896`). + +As an example of this, given: + +.. 
ipython:: python + + df = pd.DataFrame({ + 'a': [0, 0, 0, 0], + 'b': [0, 2, 3, 4], + 'c': ['A', 'B', 'C', 'D'], + }).set_index(['a', 'b']) + mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) + +The differences in reindexing ``df`` with ``mi_2`` and using ``method='backfill'`` can be seen here: + +*pandas >= 0.23, < 1.1.0*: + +.. code-block:: ipython + + In [1]: df.reindex(mi_2, method='backfill') + Out[1]: + c + 0 -1 A + 0 A + 1 D + 3 A + 4 A + 5 C + +*pandas <0.23, >= 1.1.0* + +.. ipython:: python + + df.reindex(mi_2, method='backfill') + +And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'`` can be seen here: + +*pandas >= 0.23, < 1.1.0* + +.. code-block:: ipython + + In [1]: df.reindex(mi_2, method='pad') + Out[1]: + c + 0 -1 NaN + 0 NaN + 1 D + 3 NaN + 4 A + 5 C + +*pandas < 0.23, >= 1.1.0* + +.. ipython:: python + + df.reindex(mi_2, method='pad') + - +.. _whatsnew_110.api_breaking.indexing_raises_key_errors: + +Failed Label-Based Lookups Always Raise KeyError +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Label lookups ``series[key]``, ``series.loc[key]`` and ``frame.loc[key]`` +used to raises either ``KeyError`` or ``TypeError`` depending on the type of +key and type of :class:`Index`. These now consistently raise ``KeyError`` (:issue:`31867`) + +.. ipython:: python + + ser1 = pd.Series(range(3), index=[0, 1, 2]) + ser2 = pd.Series(range(3), index=pd.date_range("2020-02-01", periods=3)) + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: ser1[1.5] + ... + TypeError: cannot do label indexing on Int64Index with these indexers [1.5] of type float + + In [4] ser1["foo"] + ... + KeyError: 'foo' + + In [5]: ser1.loc[1.5] + ... + TypeError: cannot do label indexing on Int64Index with these indexers [1.5] of type float + + In [6]: ser1.loc["foo"] + ... + KeyError: 'foo' + + In [7]: ser2.loc[1] + ... + TypeError: cannot do label indexing on DatetimeIndex with these indexers [1] of type int + + In [8]: ser2.loc[pd.Timestamp(0)] + ... + KeyError: Timestamp('1970-01-01 00:00:00') + +*New behavior*: + +.. code-block:: ipython + + In [3]: ser1[1.5] + ... + KeyError: 1.5 + + In [4] ser1["foo"] + ... + KeyError: 'foo' + + In [5]: ser1.loc[1.5] + ... + KeyError: 1.5 + + In [6]: ser1.loc["foo"] + ... + KeyError: 'foo' + + In [7]: ser2.loc[1] + ... + KeyError: 1 + + In [8]: ser2.loc[pd.Timestamp(0)] + ... + KeyError: Timestamp('1970-01-01 00:00:00') + +:meth:`DataFrame.merge` preserves right frame's row order +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) + +.. ipython:: python + + left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]}) + right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]}) + left_df + right_df + +*Previous behavior*: + +.. code-block:: python + + >>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right") + animal max_speed + 0 pig 11 + 1 quetzal 80 + +*New behavior*: + +.. ipython:: python + + left_df.merge(right_df, on=['animal', 'max_speed'], how="right") + .. --------------------------------------------------------------------------- +.. 
_whatsnew_110.api_breaking.assignment_to_multiple_columns: + +Assignment to multiple columns of a DataFrame when some columns do not exist +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns would be constructed with the right values. (:issue:`13658`) + +.. ipython:: python + + df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df[['a', 'c']] = 1 + In [4]: df + Out[4]: + a b + 0 1 1 + 1 1 1 + 2 1 1 + +*New behavior*: + +.. ipython:: python + + df[['a', 'c']] = 1 + df + .. _whatsnew_110.deprecations: Deprecations ~~~~~~~~~~~~ + - Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`) + - :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`) -- -- +- Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`) +- :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) +- :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`) +- The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) +- :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) + +- Passing any arguments but the first one to :func:`read_html` as + positional arguments is deprecated since version 1.1. All other + arguments should be given as keyword arguments (:issue:`27573`). + +- Passing any arguments but `path_or_buf` (the first one) to + :func:`read_json` as positional arguments is deprecated since + version 1.1. All other arguments should be given as keyword + arguments (:issue:`27573`). + +- :func:`pandas.api.types.is_categorical` is deprecated and will be removed in a future version; use `:func:pandas.api.types.is_categorical_dtype` instead (:issue:`33385`) .. --------------------------------------------------------------------------- @@ -113,8 +370,16 @@ Performance improvements - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`) - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`) -- -- +- Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) +- The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, + avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of + existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) +- Significant performance improvement when creating a :class:`DataFrame` with + sparse values from ``scipy.sparse`` matrices using the + :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, + :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). 
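A rough sketch of the constructor mentioned in the previous entry (a minimal example, assuming ``scipy`` is installed; the shape, density and seed below are arbitrary):

.. code-block:: python

   import pandas as pd
   from scipy import sparse

   # build a random sparse matrix; shape and density are arbitrary here
   mat = sparse.random(1000, 100, density=0.01, format="csc", random_state=0)

   # construct a DataFrame backed by SparseArray columns without densifying
   df = pd.DataFrame.sparse.from_spmatrix(mat)
   print(df.sparse.density)  # fraction of stored (non-fill) values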
+- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
+
.. ---------------------------------------------------------------------------
@@ -126,9 +391,12 @@ Bug fixes
Categorical
^^^^^^^^^^^
+
+- Bug where :func:`merge` was unable to join on non-unique categorical indices (:issue:`28189`)
- Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`)
--
--
+- Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`)
+- :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`)
+- Bug where :meth:`Categorical.replace` would replace with ``NaN`` whenever the new value and replacement value were equal (:issue:`33288`)
Datetimelike
^^^^^^^^^^^^
@@ -138,43 +406,55 @@ Datetimelike
- Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`)
- Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`)
- :class:`Timestamp` raising confusing error message when year, month or day is missing (:issue:`31200`)
+- Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`)
+- Bug in :meth:`DatetimeIndex.searchsorted` not accepting a ``list`` or :class:`Series` as its argument (:issue:`32762`)
+- Bug where :meth:`PeriodIndex` raised when passed a :class:`Series` of strings (:issue:`26109`)
+- Bug in :class:`Timestamp` arithmetic when adding or subtracting a ``np.ndarray`` with ``timedelta64`` dtype (:issue:`33296`)
+- Bug in :meth:`DatetimeIndex.to_period` not inferring the frequency when called with no arguments (:issue:`33358`)
+
Timedelta
^^^^^^^^^
- Bug in constructing a :class:`Timedelta` with a high precision integer that would round the :class:`Timedelta` components (:issue:`31354`)
--
+- Bug in dividing ``np.nan`` or ``None`` by :class:`Timedelta` incorrectly returning ``NaT`` (:issue:`31869`)
+- Timedeltas now understand ``µs`` as an identifier for microsecond (:issue:`32899`)
+- :class:`Timedelta` string representation now includes nanoseconds, when nanoseconds are non-zero (:issue:`9309`)
+- Bug in comparing a :class:`Timedelta` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`)
Timezones
^^^^^^^^^
--
+- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` where timezone names (e.g.
``UTC``) would not be parsed correctly (:issue:`33133`) - Numeric ^^^^^^^ - Bug in :meth:`DataFrame.floordiv` with ``axis=0`` not treating division-by-zero like :meth:`Series.floordiv` (:issue:`31271`) -- +- Bug in :meth:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently fails (:issue:`32394`) +- Bug in :meth:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`) +- Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`) +- Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`) +- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) - Conversion ^^^^^^^^^^ - Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`) -- -- +- Bug in :class:`Timedelta` construction with large nanoseconds keyword value (:issue:`32402`) +- Bug in :class:`DataFrame` construction where sets would be duplicated rather than raising (:issue:`32582`) Strings ^^^^^^^ -- -- +- Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`). +- Bug in :meth:`Series.str.cat` returning ``NaN`` output when other had :class:`Index` type (:issue:`33425`) Interval ^^^^^^^^ - -- +- Bug in :class:`IntervalArray` incorrectly allowing the underlying data to be changed when setting values (:issue:`32782`) - Indexing @@ -187,13 +467,22 @@ Indexing - Bug in :meth:`Series.xs` incorrectly returning ``Timestamp`` instead of ``datetime64`` in some object-dtype cases (:issue:`31630`) - Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) -- +- Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) +- Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) +- Bug in :meth:`Series.__getitem__` indexing with non-standard scalars, e.g. ``np.dtype`` (:issue:`32684`) +- Fix to preserve the ability to index with the "nearest" method with xarray's CFTimeIndex, an :class:`Index` subclass (`pydata/xarray#3751 `_, :issue:`32905`). 
+- Bug in :class:`Index` constructor where an unhelpful error message was raised for ``numpy`` scalars (:issue:`33017`)
+- Bug in :meth:`DataFrame.lookup` incorrectly raising an ``AttributeError`` when ``frame.index`` or ``frame.columns`` is not unique; this will now raise a ``ValueError`` with a helpful error message (:issue:`33041`)
+- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`32831`)
+- Bug in :meth:`DataFrame.copy` where the ``_item_cache`` was not invalidated after copying, causing post-copy value updates to not be reflected (:issue:`31784`)
+- Bug in `Series.__getitem__` with an integer key and a :class:`MultiIndex` with leading integer level failing to raise ``KeyError`` if the key is not present in the first level (:issue:`33355`)
+- Bug in :meth:`DataFrame.iloc` when slicing a single-column :class:`DataFrame` with ``ExtensionDtype`` (e.g. ``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`)
Missing
^^^^^^^
-
--
--
+- Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`).
+- Bug in :meth:`replace` where, when the ``to_replace`` argument was of type dict/list and was used on a :class:`Series` containing ``<NA>``, a ``TypeError`` was raised. The method now handles this by ignoring ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`)
+- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ``<NA>`` for all ``False`` or all ``True`` values using the nullable Boolean dtype and with ``skipna=False`` (:issue:`33253`)
MultiIndex
^^^^^^^^^^
@@ -219,20 +508,33 @@ MultiIndex
I/O
^^^
-- Bug in :meth:`read_json` where integer overflow was occuring when json contains big number strings. (:issue:`30320`)
+- Bug in :meth:`read_json` where integer overflow was occurring when json contains big number strings. (:issue:`30320`)
- `read_csv` will now raise a ``ValueError`` when the arguments `header` and `prefix` both are not `None`. (:issue:`27394`)
- Bug in :meth:`DataFrame.to_json` was raising ``NotFoundError`` when ``path_or_buf`` was an S3 URI (:issue:`28375`)
- Bug in :meth:`DataFrame.to_parquet` overwriting pyarrow's default for ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`).
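A minimal sketch of the parquet round-trip described in the previous entry (assuming ``pyarrow`` is installed; the file name is arbitrary):

.. code-block:: python

   import pandas as pd

   # a timestamp carrying nanosecond precision
   df = pd.DataFrame({"ts": pd.to_datetime(["2020-01-01 00:00:00.000000001"])})

   # following pyarrow's defaults, version="2.0" lets the nanoseconds survive
   df.to_parquet("ns_roundtrip.parquet", engine="pyarrow", version="2.0")
   print(pd.read_parquet("ns_roundtrip.parquet", engine="pyarrow"))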
+- Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`)
- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
+- Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`)
+- :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`)
+- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`)
+- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`)
+- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`)
+- Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`)
+- Bug in :meth:`read_csv` was raising an ``IndexError`` when header=None and 2 extra data columns
+- Bug in :meth:`read_sas` was raising an ``AttributeError`` when reading files from Google Cloud Storage (:issue:`33069`)
+- Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`)
+- Bug in :meth:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells. (:issue:`32207`)
+- Bug in :meth:`read_json` was raising ``TypeError`` when reading a list of booleans into a Series. (:issue:`31464`)
Plotting
^^^^^^^^
- :func:`.plot` for line/bar now accepts color by dictonary (:issue:`8193`).
--
+- Bug in :meth:`DataFrame.plot.hist` where ``weights`` were not working for multiple columns (:issue:`33173`)
- Bug in :meth:`DataFrame.boxplot` and :meth:`DataFrame.plot.boxplot` lost color attributes of ``medianprops``, ``whiskerprops``, ``capprops`` and ``medianprops`` (:issue:`30346`)
+- Bug in :meth:`DataFrame.plot.scatter` where, when adding multiple plots with different ``cmap``, colorbars always used the first ``cmap`` (:issue:`33389`)
Groupby/resample/rolling
@@ -240,6 +542,13 @@ Groupby/resample/rolling
- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)
- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`)
+- Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by column contains NaNs (:issue:`32841`)
+- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`)
+- Bug in :meth:`SeriesGroupBy.quantile` raising on nullable integers (:issue:`33136`)
+- Bug in :meth:`SeriesGroupBy.first`, :meth:`SeriesGroupBy.last`, :meth:`SeriesGroupBy.min`, and :meth:`SeriesGroupBy.max` returning floats when applied to nullable Booleans (:issue:`33071`)
+- Bug in :meth:`DataFrameGroupBy.agg` with dictionary input losing ``ExtensionArray`` dtypes (:issue:`32194`)
+- Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`)
+- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False``
(:issue:`33410`)
Reshaping
^^^^^^^^^
@@ -252,18 +561,27 @@ Reshaping
- Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`)
- :meth:`DataFrame.pivot` can now take lists for ``index`` and ``columns`` arguments (:issue:`21425`)
- Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`)
+- Bug where :meth:`Index.astype` would lose the name attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`)
+- :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing a DataFrame (:issue:`31413`)
+- :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`)
+- Bug where an inplace operation on a Series added a column back to the DataFrame from which it was originally dropped (when using ``inplace=True``) (:issue:`30484`)
+- Bug in :meth:`DataFrame.apply` where the callback was called with a :class:`Series` parameter even though ``raw=True`` was requested. (:issue:`32423`)
+- Bug in :meth:`DataFrame.pivot_table` losing timezone information when creating a :class:`MultiIndex` level from a column with timezone-aware dtype (:issue:`32558`)
+- Bug in :meth:`concat` where passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`)
+- :meth:`DataFrame.agg` now provides a more descriptive ``SpecificationError`` message when attempting to aggregate a non-existent column (:issue:`32755`)
+- Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`)
Sparse
^^^^^^
-
+- Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`)
-
-
ExtensionArray
^^^^^^^^^^^^^^
--
+- Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`)
-
@@ -273,7 +591,15 @@ Other
instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
- Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`)
- Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`)
--
+- Bug in :class:`DataFrame` when constructing the frame from lists and assigning a nested list to ``columns`` to create a ``MultiIndex`` (:issue:`32173`)
+- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`)
+- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
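A minimal sketch of the stricter check from the previous entry (the ``MySeries`` subclass is hypothetical and used only for illustration):

.. code-block:: python

   import pandas as pd
   from pandas import testing as tm

   class MySeries(pd.Series):
       """Trivial Series subclass, defined only for this example."""

   try:
       # with check_series_type=True, differing Series subclasses now raise
       tm.assert_series_equal(MySeries([1, 2, 3]), pd.Series([1, 2, 3]),
                              check_series_type=True)
   except AssertionError as err:
       print(err)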
+- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`) +- Getting a missing attribute in a query/eval string raises the correct ``AttributeError`` (:issue:`32408`) +- Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) +- Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) +- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) +- Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`) .. --------------------------------------------------------------------------- diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index e4859157f73de..9c175e4e58b45 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -68,8 +68,21 @@ def get_authors(revision_range): revision_range = f"{lst_release}..{cur_release}" # authors, in current release and previous to current release. - cur = set(re.findall(pat, this_repo.git.shortlog("-s", revision_range), re.M)) - pre = set(re.findall(pat, this_repo.git.shortlog("-s", lst_release), re.M)) + # We need two passes over the log for cur and prev, one to get the + # "Co-authored by" commits, which come from backports by the bot, + # and one for regular commits. + xpr = re.compile(r"Co-authored-by: (?P[^<]+) ") + cur = set( + xpr.findall( + this_repo.git.log("--grep=Co-authored", "--pretty=%b", revision_range) + ) + ) + cur |= set(re.findall(pat, this_repo.git.shortlog("-s", revision_range), re.M)) + + pre = set( + xpr.findall(this_repo.git.log("--grep=Co-authored", "--pretty=%b", lst_release)) + ) + pre |= set(re.findall(pat, this_repo.git.shortlog("-s", lst_release), re.M)) # Homu is the author of auto merges, clean him out. cur.discard("Homu") @@ -122,14 +135,15 @@ def build_string(revision_range, heading="Contributors"): components["uline"] = "=" * len(components["heading"]) components["authors"] = "* " + "\n* ".join(components["authors"]) + # Don't change this to an fstring. It breaks the formatting. tpl = textwrap.dedent( - f"""\ - {components['heading']} - {components['uline']} + """\ + {heading} + {uline} - {components['author_message']} - {components['authors']}""" - ) + {author_message} + {authors}""" + ).format(**components) return tpl diff --git a/environment.yml b/environment.yml index cbdaf8e6c4217..67b2df4dc5a0e 100644 --- a/environment.yml +++ b/environment.yml @@ -4,15 +4,15 @@ channels: dependencies: # required - numpy>=1.15 - - python=3.7 - - python-dateutil>=2.6.1 + - python=3 + - python-dateutil>=2.7.3 - pytz # benchmarks - asv # building - - cython>=0.29.13 + - cython>=0.29.16 # code checks - black=19.10b0 @@ -86,7 +86,7 @@ dependencies: - lxml # pd.read_excel, DataFrame.to_excel, pd.ExcelWriter, pd.ExcelFile - - openpyxl<=3.0.1 + - openpyxl - xlrd - xlsxwriter - xlwt @@ -101,8 +101,9 @@ dependencies: - s3fs # pandas.read_csv... when using 's3://...' 
path - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray + - cftime # Needed for downstream xarray.CFTimeIndex test - pyreadstat # pandas.read_spss - tabulate>=0.8.3 # DataFrame.to_markdown - pip: - - git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master + - git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master - git+https://github.com/numpy/numpydoc diff --git a/pandas/__init__.py b/pandas/__init__.py index 2d3d3f7d92a9c..2b9a461e0e95d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -37,7 +37,7 @@ f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " "'python setup.py build_ext --inplace --force' to build the C extensions first." - ) + ) from e from pandas._config import ( get_option, @@ -290,8 +290,8 @@ def __getattr__(self, item): try: return getattr(self.np, item) - except AttributeError: - raise AttributeError(f"module numpy has no attribute {item}") + except AttributeError as err: + raise AttributeError(f"module numpy has no attribute {item}") from err np = __numpy() @@ -306,8 +306,10 @@ def __getattr__(cls, item): try: return getattr(cls.datetime, item) - except AttributeError: - raise AttributeError(f"module datetime has no attribute {item}") + except AttributeError as err: + raise AttributeError( + f"module datetime has no attribute {item}" + ) from err def __instancecheck__(cls, other): return isinstance(other, cls.datetime) diff --git a/pandas/_config/config.py b/pandas/_config/config.py index f1959cd70ed3a..8955a06187109 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -51,20 +51,11 @@ from collections import namedtuple from contextlib import contextmanager import re -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Optional, - Tuple, - Type, - TypeVar, - cast, -) +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, cast import warnings +from pandas._typing import F + DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver") RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb") @@ -213,8 +204,8 @@ def __getattr__(self, key: str): prefix += key try: v = object.__getattribute__(self, "d")[key] - except KeyError: - raise OptionError("No such option") + except KeyError as err: + raise OptionError("No such option") from err if isinstance(v, dict): return DictWrapper(v, prefix) else: @@ -704,9 +695,6 @@ def pp(name: str, ks: Iterable[str]) -> List[str]: # # helpers -FuncType = Callable[..., Any] -F = TypeVar("F", bound=FuncType) - @contextmanager def config_prefix(prefix): diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index af67cb3be7102..141ca0645b906 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -1,6 +1,15 @@ -# flake8: noqa +__all__ = [ + "NaT", + "NaTType", + "OutOfBoundsDatetime", + "Period", + "Timedelta", + "Timestamp", + "iNaT", +] -from .tslibs import ( + +from pandas._libs.tslibs import ( NaT, NaTType, OutOfBoundsDatetime, diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b7f17aee35a44..6b6ead795584f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -38,23 +38,29 @@ cimport pandas._libs.util as util from pandas._libs.util cimport numeric, get_nat from pandas._libs.khash cimport ( - khiter_t, kh_destroy_int64, kh_put_int64, kh_init_int64, kh_int64_t, - kh_resize_int64, kh_get_int64) - -import 
pandas._libs.missing as missing + kh_destroy_int64, + kh_get_int64, + kh_init_int64, + kh_int64_t, + kh_put_int64, + kh_resize_int64, + khiter_t, +) -cdef float64_t FP_ERR = 1e-13 -cdef float64_t NaN = np.NaN +import pandas._libs.missing as missing -cdef int64_t NPY_NAT = get_nat() +cdef: + float64_t FP_ERR = 1e-13 + float64_t NaN = np.NaN + int64_t NPY_NAT = get_nat() tiebreakers = { - 'average': TIEBREAK_AVERAGE, - 'min': TIEBREAK_MIN, - 'max': TIEBREAK_MAX, - 'first': TIEBREAK_FIRST, - 'dense': TIEBREAK_DENSE, + "average": TIEBREAK_AVERAGE, + "min": TIEBREAK_MIN, + "max": TIEBREAK_MAX, + "first": TIEBREAK_FIRST, + "dense": TIEBREAK_DENSE, } @@ -113,6 +119,7 @@ cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr): kh_int64_t *table int ret = 0 list uniques = [] + ndarray[int64_t, ndim=1] result table = kh_init_int64() kh_resize_int64(table, 10) @@ -254,7 +261,7 @@ def kth_smallest(numeric[:] a, Py_ssize_t k) -> numeric: @cython.boundscheck(False) @cython.wraparound(False) -def nancorr(const float64_t[:, :] mat, bint cov=0, minp=None): +def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): cdef: Py_ssize_t i, j, xi, yi, N, K bint minpv @@ -318,7 +325,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=0, minp=None): @cython.boundscheck(False) @cython.wraparound(False) -def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): +def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1) -> ndarray: cdef: Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result @@ -574,7 +581,7 @@ D @cython.boundscheck(False) @cython.wraparound(False) -def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): +def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t, ndim=1] indexer @@ -791,25 +798,26 @@ ctypedef fused rank_t: @cython.wraparound(False) @cython.boundscheck(False) -def rank_1d(rank_t[:] in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): +def rank_1d( + rank_t[:] in_arr, + ties_method="average", + bint ascending=True, + na_option="keep", + bint pct=False, +): """ Fast NaN-friendly version of ``scipy.stats.rankdata``. """ cdef: Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - ndarray[rank_t] sorted_data, values - ndarray[float64_t] ranks ndarray[int64_t] argsorted ndarray[uint8_t, cast=True] sorted_mask - rank_t val, nan_value - float64_t sum_ranks = 0 int tiebreak = 0 - bint keep_na = 0 + bint keep_na = False bint isnan, condition float64_t count = 0.0 @@ -1009,26 +1017,27 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', return ranks -def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): +def rank_2d( + rank_t[:, :] in_arr, + int axis=0, + ties_method="average", + bint ascending=True, + na_option="keep", + bint pct=False, +): """ Fast NaN-friendly version of ``scipy.stats.rankdata``. 
""" cdef: Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - Py_ssize_t infs - ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values - ndarray[int64_t, ndim=2] argsorted - rank_t val, nan_value - float64_t sum_ranks = 0 int tiebreak = 0 - bint keep_na = 0 + bint keep_na = False float64_t count = 0.0 bint condition, skip_condition @@ -1190,9 +1199,12 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) -def diff_2d(diff_t[:, :] arr, - out_t[:, :] out, - Py_ssize_t periods, int axis): +def diff_2d( + diff_t[:, :] arr, + out_t[:, :] out, + Py_ssize_t periods, + int axis, +): cdef: Py_ssize_t i, j, sx, sy, start, stop bint f_contig = arr.is_f_contig() diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 27b3095d8cb4f..e7ac3b8442c6d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -848,11 +848,13 @@ cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: return val != val +# GH#31710 use memorviews once cython 0.30 is released so we can +# use `const rank_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_last(rank_t[:, :] out, int64_t[:] counts, - rank_t[:, :] values, + ndarray[rank_t, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): """ @@ -867,7 +869,9 @@ def group_last(rank_t[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + # TODO(cython 3.0): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -937,11 +941,13 @@ def group_last(rank_t[:, :] out, raise RuntimeError("empty group with uint64_t") +# GH#31710 use memorviews once cython 0.30 is released so we can +# use `const rank_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_nth(rank_t[:, :] out, int64_t[:] counts, - rank_t[:, :] values, + ndarray[rank_t, ndim=2] values, const int64_t[:] labels, int64_t rank=1, Py_ssize_t min_count=-1): """ @@ -956,7 +962,9 @@ def group_nth(rank_t[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + # TODO(cython 3.0): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -1235,7 +1243,7 @@ ctypedef fused groupby_t: @cython.boundscheck(False) def group_max(groupby_t[:, :] out, int64_t[:] counts, - groupby_t[:, :] values, + ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): """ @@ -1250,7 +1258,9 @@ def group_max(groupby_t[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + # TODO(cython 3.0): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -1308,7 +1318,7 @@ def group_max(groupby_t[:, :] out, @cython.boundscheck(False) def group_min(groupby_t[:, :] out, int64_t[:] counts, - groupby_t[:, :] values, + ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): """ @@ -1323,7 +1333,9 @@ def group_min(groupby_t[:, :] out, assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + # TODO(cython 3.0): + # Instead of `labels.shape[0]` use 
`len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 884db9ee931d4..e80f134290a7e 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -86,7 +86,10 @@ cdef class Factorizer: self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None ): """ + Examples + -------- Factorize values with nans replaced by na_sentinel + >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ @@ -131,7 +134,10 @@ cdef class Int64Factorizer: def factorize(self, const int64_t[:] values, sort=False, na_sentinel=-1, na_value=None): """ + Examples + -------- Factorize values with nans replaced by na_sentinel + >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) array([ 0, 1, 20]) """ diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 811025a4b5764..3ce3bc519b311 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -12,6 +12,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA +cdef extern from "Python.h": + void PyErr_Clear() + {{py: # name, dtype, c_type @@ -193,7 +196,7 @@ cdef class StringVector: append_data_string(self.data, x) - cdef extend(self, ndarray[:] x): + cdef extend(self, ndarray[object] x): for i in range(len(x)): self.append(x[i]) @@ -238,7 +241,7 @@ cdef class ObjectVector: self.external_view_exists = True return self.ao - cdef extend(self, ndarray[:] x): + cdef extend(self, ndarray[object] x): for i in range(len(x)): self.append(x[i]) @@ -671,7 +674,7 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string wont recognize + # GH#31499 if we have a np.str_ get_c_string won't recognize # it as a str, even though isinstance does. v = get_c_string(val) else: @@ -706,7 +709,7 @@ cdef class StringHashTable(HashTable): val = values[i] if isinstance(val, str): - # GH#31499 if we have a np.str_ get_c_string wont recognize + # GH#31499 if we have a np.str_ get_c_string won't recognize # it as a str, even though isinstance does. v = get_c_string(val) else: @@ -790,6 +793,9 @@ cdef class StringHashTable(HashTable): else: # if ignore_na is False, we also stringify NaN/None/etc. 
v = get_c_string(val) + if v == NULL: + PyErr_Clear() + v = get_c_string(repr(val)) vecs[i] = v # compute diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index f8f3858b803a5..6e5509a5570e8 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -206,7 +206,7 @@ def duplicated_{{dtype}}({{c_type}}[:] values, object keep='first'): {{if dtype == 'object'}} def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} -def ismember_{{dtype}}({{c_type}}[:] arr, {{c_type}}[:] values): +def ismember_{{dtype}}(const {{c_type}}[:] arr, {{c_type}}[:] values): {{endif}} """ Return boolean of values in arr on an diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 6141e2b78e9f4..d8e0d9c6bd7ab 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -2,10 +2,19 @@ import warnings import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, intp_t, - float64_t, float32_t, - int64_t, int32_t, int16_t, int8_t, - uint64_t, uint32_t, uint16_t, uint8_t +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + intp_t, + ndarray, + uint8_t, + uint16_t, + uint32_t, + uint64_t, ) cnp.import_array() @@ -115,8 +124,6 @@ cdef class IndexEngine: cdef _maybe_get_bool_indexer(self, object val): cdef: ndarray[uint8_t, ndim=1, cast=True] indexer - ndarray[intp_t, ndim=1] found - int count indexer = self._get_index_values() == val return self._unpack_bool_indexer(indexer, val) @@ -366,7 +373,7 @@ cdef class ObjectEngine(IndexEngine): cdef class DatetimeEngine(Int64Engine): - cdef _get_box_dtype(self): + cdef str _get_box_dtype(self): return 'M8[ns]' cdef int64_t _unbox_scalar(self, scalar) except? -1: @@ -456,7 +463,7 @@ cdef class DatetimeEngine(Int64Engine): cdef class TimedeltaEngine(DatetimeEngine): - cdef _get_box_dtype(self): + cdef str _get_box_dtype(self): return 'm8[ns]' cdef int64_t _unbox_scalar(self, scalar) except? -1: @@ -472,7 +479,6 @@ cdef class PeriodEngine(Int64Engine): return scalar.value if isinstance(scalar, Period): # NB: we assume that we have the correct freq here. - # TODO: potential optimize by checking for _Period? return scalar.ordinal raise TypeError(scalar) @@ -606,25 +612,113 @@ cdef class BaseMultiIndexCodesEngine: in zip(self.levels, zip(*target))] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) - def get_indexer(self, object target, object method=None, - object limit=None): + def get_indexer_no_fill(self, object target) -> np.ndarray: + """ + Returns an array giving the positions of each value of `target` in + `self.values`, where -1 represents a value in `target` which does not + appear in `self.values` + + Parameters + ---------- + target : list-like of keys + Each key is a tuple, with a label for each level of the index + + Returns + ------- + np.ndarray[int64_t, ndim=1] of the indexer of `target` into + `self.values` + """ lab_ints = self._extract_level_codes(target) + return self._base.get_indexer(self, lab_ints) - # All methods (exact, backfill, pad) directly map to the respective - # methods of the underlying (integers) index... - if method is not None: - # but underlying backfill and pad methods require index and keys - # to be sorted. 
The index already is (checked in - # Index._get_fill_indexer), sort (integer representations of) keys: - order = np.argsort(lab_ints) - lab_ints = lab_ints[order] - indexer = (getattr(self._base, f'get_{method}_indexer') - (self, lab_ints, limit=limit)) - indexer = indexer[order] - else: - indexer = self._base.get_indexer(self, lab_ints) + def get_indexer(self, object target, object values = None, + object method = None, object limit = None) -> np.ndarray: + """ + Returns an array giving the positions of each value of `target` in + `values`, where -1 represents a value in `target` which does not + appear in `values` - return indexer + If `method` is "backfill" then the position for a value in `target` + which does not appear in `values` is that of the next greater value + in `values` (if one exists), and -1 if there is no such value. + + Similarly, if the method is "pad" then the position for a value in + `target` which does not appear in `values` is that of the next smaller + value in `values` (if one exists), and -1 if there is no such value. + + Parameters + ---------- + target: list-like of tuples + need not be sorted, but all must have the same length, which must be + the same as the length of all tuples in `values` + values : list-like of tuples + must be sorted and all have the same length. Should be the set of + the MultiIndex's values. Needed only if `method` is not None + method: string + "backfill" or "pad" + limit: int, optional + if provided, limit the number of fills to this value + + Returns + ------- + np.ndarray[int64_t, ndim=1] of the indexer of `target` into `values`, + filled with the `method` (and optionally `limit`) specified + """ + if method is None: + return self.get_indexer_no_fill(target) + + assert method in ("backfill", "pad") + cdef: + int64_t i, j, next_code + int64_t num_values, num_target_values + ndarray[int64_t, ndim=1] target_order + ndarray[object, ndim=1] target_values + ndarray[int64_t, ndim=1] new_codes, new_target_codes + ndarray[int64_t, ndim=1] sorted_indexer + + target_order = np.argsort(target.values).astype('int64') + target_values = target.values[target_order] + num_values, num_target_values = len(values), len(target_values) + new_codes, new_target_codes = ( + np.empty((num_values,)).astype('int64'), + np.empty((num_target_values,)).astype('int64'), + ) + + # `values` and `target_values` are both sorted, so we walk through them + # and memoize the (ordered) set of indices in the (implicit) merged-and + # sorted list of the two which belong to each of them + # the effect of this is to create a factorization for the (sorted) + # merger of the index values, where `new_codes` and `new_target_codes` + # are the subset of the factors which appear in `values` and `target`, + # respectively + i, j, next_code = 0, 0, 0 + while i < num_values and j < num_target_values: + val, target_val = values[i], target_values[j] + if val <= target_val: + new_codes[i] = next_code + i += 1 + if target_val <= val: + new_target_codes[j] = next_code + j += 1 + next_code += 1 + + # at this point, at least one should have reached the end + # the remaining values of the other should be added to the end + assert i == num_values or j == num_target_values + while i < num_values: + new_codes[i] = next_code + i += 1 + next_code += 1 + while j < num_target_values: + new_target_codes[j] = next_code + j += 1 + next_code += 1 + + # get the indexer, and undo the sorting of `target.values` + sorted_indexer = ( + algos.backfill if method == "backfill" else algos.pad + )(new_codes, 
new_target_codes, limit=limit).astype('int64') + return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): if is_definitely_invalid_key(key): diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index 316943edee124..f9aedeb8ad93e 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -2,7 +2,8 @@ cdef class _NDFrameIndexerBase: """ A base class for _NDFrameIndexer for fast instantiation and attribute access. """ - cdef public object obj, name, _ndim + cdef public: + object obj, name, _ndim def __init__(self, name, obj): self.obj = obj diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8bbbc6db94842..d3d8bead88d08 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,4 +1,5 @@ import cython +from collections import defaultdict from cython import Py_ssize_t from cpython.slice cimport PySlice_GetIndicesEx @@ -7,7 +8,9 @@ cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX import numpy as np -from numpy cimport int64_t +cimport numpy as cnp +from numpy cimport NPY_INT64, int64_t +cnp.import_array() from pandas._libs.algos import ensure_int64 @@ -17,7 +20,6 @@ cdef class BlockPlacement: cdef: slice _as_slice object _as_array - bint _has_slice, _has_array, _is_known_slice_like def __init__(self, val): @@ -29,7 +31,11 @@ cdef class BlockPlacement: self._has_slice = False self._has_array = False - if isinstance(val, slice): + if isinstance(val, int): + slc = slice(val, val + 1, 1) + self._as_slice = slc + self._has_slice = True + elif isinstance(val, slice): slc = slice_canonize(val) if slc.start != slc.stop: @@ -49,12 +55,13 @@ cdef class BlockPlacement: def __str__(self) -> str: cdef: slice s = self._ensure_has_slice() + if s is not None: v = self._as_slice else: v = self._as_array - return f'{type(self).__name__}({v})' + return f"{type(self).__name__}({v})" def __repr__(self) -> str: return str(self) @@ -62,6 +69,7 @@ cdef class BlockPlacement: def __len__(self) -> int: cdef: slice s = self._ensure_has_slice() + if s is not None: return slice_len(s) else: @@ -71,6 +79,7 @@ cdef class BlockPlacement: cdef: slice s = self._ensure_has_slice() Py_ssize_t start, stop, step, _ + if s is not None: start, stop, step, _ = slice_get_indices_ex(s) return iter(range(start, stop, step)) @@ -81,44 +90,47 @@ cdef class BlockPlacement: def as_slice(self) -> slice: cdef: slice s = self._ensure_has_slice() - if s is None: - raise TypeError('Not slice-like') - else: + + if s is not None: return s + else: + raise TypeError("Not slice-like") @property def indexer(self): cdef: slice s = self._ensure_has_slice() + if s is not None: return s else: return self._as_array - def isin(self, arr): - from pandas.core.indexes.api import Int64Index - return Int64Index(self.as_array, copy=False).isin(arr) - @property - def as_array(self): + def as_array(self) -> np.ndarray: cdef: Py_ssize_t start, stop, end, _ + if not self._has_array: start, stop, step, _ = slice_get_indices_ex(self._as_slice) - self._as_array = np.arange(start, stop, step, - dtype=np.int64) + # NOTE: this is the C-optimized equivalent of + # `np.arange(start, stop, step, dtype=np.int64)` + self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INT64) self._has_array = True + return self._as_array @property def is_slice_like(self) -> bool: cdef: slice s = self._ensure_has_slice() + return s is not None def __getitem__(self, loc): cdef: slice s = self._ensure_has_slice() + if s is not None: val = slice_getitem(s, loc) else: @@ -129,15 +141,16 @@ cdef 
class BlockPlacement: return BlockPlacement(val) - def delete(self, loc): + def delete(self, loc) -> "BlockPlacement": return BlockPlacement(np.delete(self.as_array, loc, axis=0)) - def append(self, others): - if len(others) == 0: + def append(self, others) -> "BlockPlacement": + if not len(others): return self - return BlockPlacement(np.concatenate([self.as_array] + - [o.as_array for o in others])) + return BlockPlacement( + np.concatenate([self.as_array] + [o.as_array for o in others]) + ) cdef iadd(self, other): cdef: @@ -155,8 +168,7 @@ cdef class BlockPlacement: start += other_int stop += other_int - if ((step > 0 and start < 0) or - (step < 0 and stop < step)): + if (step > 0 and start < 0) or (step < 0 and stop < step): raise ValueError("iadd causes length change") if stop < 0: @@ -173,16 +185,15 @@ cdef class BlockPlacement: val = newarr return BlockPlacement(val) - def add(self, other): + def add(self, other) -> "BlockPlacement": + # We can get here with int or ndarray return self.iadd(other) - def sub(self, other): - return self.add(-other) - cdef slice _ensure_has_slice(self): if not self._has_slice: self._as_slice = indexer_as_slice(self._as_array) self._has_slice = True + return self._as_slice @@ -232,8 +243,7 @@ cdef slice slice_canonize(slice s): return slice(start, stop, step) -cpdef Py_ssize_t slice_len( - slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: +cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: """ Get length of a bounded slice. @@ -250,8 +260,7 @@ cpdef Py_ssize_t slice_len( if slc is None: raise TypeError("slc must be slice") - PySlice_GetIndicesEx(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return length @@ -269,8 +278,7 @@ cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): if slc is None: raise TypeError("slc should be a slice") - PySlice_GetIndicesEx(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return start, stop, step, length @@ -283,8 +291,7 @@ cdef slice_getitem(slice slc, ind): s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc) if isinstance(ind, slice): - ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind, - s_len) + ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind, s_len) if ind_step > 0 and ind_len == s_len: # short-cut for no-op slice @@ -305,7 +312,10 @@ cdef slice_getitem(slice slc, ind): return slice(s_start, s_stop, s_step) else: - return np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind] + # NOTE: + # this is the C-optimized equivalent of + # `np.arange(s_start, s_stop, s_step, dtype=np.int64)[ind]` + return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INT64)[ind] @cython.boundscheck(False) @@ -368,67 +378,52 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): # blockno handling. 
cdef: int64_t cur_blkno - Py_ssize_t i, start, stop, n, diff - + Py_ssize_t i, start, stop, n, diff, tot_len object blkno - list group_order - dict group_dict - int64_t[:] res_view + object group_dict = defaultdict(list) n = blknos.shape[0] - - if n == 0: - return - + result = list() start = 0 cur_blkno = blknos[start] - if group is False: + if n == 0: + pass + elif group is False: for i in range(1, n): if blknos[i] != cur_blkno: - yield cur_blkno, slice(start, i) + result.append((cur_blkno, slice(start, i))) start = i cur_blkno = blknos[i] - yield cur_blkno, slice(start, n) + result.append((cur_blkno, slice(start, n))) else: - group_order = [] - group_dict = {} - for i in range(1, n): if blknos[i] != cur_blkno: - if cur_blkno not in group_dict: - group_order.append(cur_blkno) - group_dict[cur_blkno] = [(start, i)] - else: - group_dict[cur_blkno].append((start, i)) + group_dict[cur_blkno].append((start, i)) start = i cur_blkno = blknos[i] - if cur_blkno not in group_dict: - group_order.append(cur_blkno) - group_dict[cur_blkno] = [(start, n)] - else: - group_dict[cur_blkno].append((start, n)) + group_dict[cur_blkno].append((start, n)) - for blkno in group_order: - slices = group_dict[blkno] + for blkno, slices in group_dict.items(): if len(slices) == 1: - yield blkno, slice(slices[0][0], slices[0][1]) + result.append((blkno, slice(slices[0][0], slices[0][1]))) else: tot_len = sum(stop - start for start, stop in slices) - result = np.empty(tot_len, dtype=np.int64) - res_view = result + arr = np.empty(tot_len, dtype=np.int64) i = 0 for start, stop in slices: for diff in range(start, stop): - res_view[i] = diff + arr[i] = diff i += 1 - yield blkno, result + result.append((blkno, arr)) + + return result def get_blkno_placements(blknos, group: bool = True): diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 1166768472449..a47303ddc93cf 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -1,8 +1,16 @@ import numbers from operator import le, lt -from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, - PyObject_RichCompare) +from cpython.object cimport ( + Py_EQ, + Py_GE, + Py_GT, + Py_LE, + Py_LT, + Py_NE, + PyObject_RichCompare, +) + import cython from cython import Py_ssize_t @@ -10,9 +18,16 @@ from cython import Py_ssize_t import numpy as np cimport numpy as cnp from numpy cimport ( - int64_t, int32_t, float64_t, float32_t, uint64_t, + NPY_QUICKSORT, + PyArray_ArgSort, + PyArray_Take, + float32_t, + float64_t, + int32_t, + int64_t, ndarray, - PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take) + uint64_t, +) cnp.import_array() @@ -179,7 +194,7 @@ cdef class IntervalMixin: f"expected {repr(self.closed)}.") -cdef _interval_like(other): +cdef bint _interval_like(other): return (hasattr(other, 'left') and hasattr(other, 'right') and hasattr(other, 'closed')) @@ -481,8 +496,7 @@ cdef class Interval(IntervalMixin): @cython.wraparound(False) @cython.boundscheck(False) -def intervals_to_interval_bounds(ndarray intervals, - bint validate_closed=True): +def intervals_to_interval_bounds(ndarray intervals, bint validate_closed=True): """ Parameters ---------- @@ -502,14 +516,14 @@ def intervals_to_interval_bounds(ndarray intervals, """ cdef: object closed = None, interval - int64_t n = len(intervals) + Py_ssize_t i, n = len(intervals) ndarray left, right bint seen_closed = False left = np.empty(n, dtype=intervals.dtype) right = np.empty(n, dtype=intervals.dtype) - for i in range(len(intervals)): + for i in range(n): interval = intervals[i] if interval 
is None or util.is_nan(interval): left[i] = np.nan diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index dfa7aa708d681..54892a7e4bc77 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -78,7 +78,7 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, @cython.boundscheck(False) def left_outer_join(const int64_t[:] left, const int64_t[:] right, - Py_ssize_t max_groups, sort=True): + Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 ndarray[int64_t] left_count, right_count, left_sorter, right_sorter @@ -254,6 +254,8 @@ ctypedef fused join_t: float64_t float32_t object + int8_t + int16_t int32_t int64_t uint64_t @@ -668,7 +670,7 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 + bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 HashTable hash_table @@ -676,7 +678,7 @@ def asof_join_backward_on_X_by_Y(asof_t[:] left_values, # if we are using tolerance, set our objects if tolerance is not None: - has_tolerance = 1 + has_tolerance = True tolerance_ = tolerance left_size = len(left_values) @@ -737,7 +739,7 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 + bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 HashTable hash_table @@ -745,7 +747,7 @@ def asof_join_forward_on_X_by_Y(asof_t[:] left_values, # if we are using tolerance, set our objects if tolerance is not None: - has_tolerance = 1 + has_tolerance = True tolerance_ = tolerance left_size = len(left_values) @@ -800,7 +802,7 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, asof_t[:] right_values, by_t[:] left_by_values, by_t[:] right_by_values, - bint allow_exact_matches=1, + bint allow_exact_matches=True, tolerance=None): cdef: @@ -815,18 +817,22 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, right_indexer = np.empty(left_size, dtype=np.int64) # search both forward and backward - bli, bri = asof_join_backward_on_X_by_Y(left_values, - right_values, - left_by_values, - right_by_values, - allow_exact_matches, - tolerance) - fli, fri = asof_join_forward_on_X_by_Y(left_values, - right_values, - left_by_values, - right_by_values, - allow_exact_matches, - tolerance) + bli, bri = asof_join_backward_on_X_by_Y( + left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance, + ) + fli, fri = asof_join_forward_on_X_by_Y( + left_values, + right_values, + left_by_values, + right_by_values, + allow_exact_matches, + tolerance, + ) for i in range(len(bri)): # choose timestamp from right with smaller difference @@ -847,19 +853,19 @@ def asof_join_nearest_on_X_by_Y(asof_t[:] left_values, def asof_join_backward(asof_t[:] left_values, asof_t[:] right_values, - bint allow_exact_matches=1, + bint allow_exact_matches=True, tolerance=None): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 + bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: - has_tolerance = 1 + has_tolerance = True tolerance_ = tolerance left_size = len(left_values) @@ -900,19 +906,19 @@ def asof_join_backward(asof_t[:] left_values, def asof_join_forward(asof_t[:] left_values, 
asof_t[:] right_values, - bint allow_exact_matches=1, + bint allow_exact_matches=True, tolerance=None): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size ndarray[int64_t] left_indexer, right_indexer - bint has_tolerance = 0 + bint has_tolerance = False asof_t tolerance_ = 0 asof_t diff = 0 # if we are using tolerance, set our objects if tolerance is not None: - has_tolerance = 1 + has_tolerance = True tolerance_ = tolerance left_size = len(left_values) @@ -954,7 +960,7 @@ def asof_join_forward(asof_t[:] left_values, def asof_join_nearest(asof_t[:] left_values, asof_t[:] right_values, - bint allow_exact_matches=1, + bint allow_exact_matches=True, tolerance=None): cdef: diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index ca3b83852b098..b5fe73df5d9be 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from cpython.object cimport PyObject from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7a18429f21a18..6147d6d9c1658 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -4,6 +4,7 @@ from fractions import Fraction from numbers import Number import sys +import warnings import cython from cython import Py_ssize_t @@ -94,22 +95,6 @@ cdef: float64_t NaN = np.NaN -def values_from_object(obj: object): - """ - Return my values or the object if we are say an ndarray. - """ - func: object - - if getattr(obj, '_typ', '') == 'dataframe': - return obj.values - - func = getattr(obj, '_internal_get_values', None) - if func is not None: - obj = func() - - return obj - - @cython.wraparound(False) @cython.boundscheck(False) def memory_usage_of_objects(arr: object[:]) -> int64_t: @@ -302,7 +287,11 @@ def fast_unique_multiple(list arrays, sort: bool = True): try: uniques.sort() except TypeError: - # TODO: RuntimeWarning? + warnings.warn( + "The values in the array are unorderable. " + "Pass `sort=False` to suppress this warning.", + RuntimeWarning, + ) pass return uniques @@ -546,14 +535,14 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): cdef: Py_ssize_t i, n = len(mask) Py_ssize_t start = 0, end = 0 - bint started = 0, finished = 0 + bint started = False, finished = False for i in range(n): if mask[i]: if finished: return mask.view(np.bool_) if not started: - started = 1 + started = True start = i else: if finished: @@ -561,7 +550,7 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): if started: end = i - finished = 1 + finished = True if not started: return slice(0, 0) @@ -673,13 +662,13 @@ def clean_index_list(obj: list): cdef: Py_ssize_t i, n = len(obj) object val - bint all_arrays = 1 + bint all_arrays = True for i in range(n): val = obj[i] if not (isinstance(val, list) or util.is_array(val) or hasattr(val, '_data')): - all_arrays = 0 + all_arrays = False break if all_arrays: @@ -708,7 +697,7 @@ def clean_index_list(obj: list): @cython.boundscheck(False) @cython.wraparound(False) def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner, - object closed='left', bint hasnans=0): + object closed='left', bint hasnans=False): """ Int64 (datetime64) version of generic python version in ``groupby.py``. 
""" @@ -809,14 +798,16 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, with nogil: for i in range(n): for j in range(k): - counts[labels[i], j] += mask[i, j] + if mask[i, j]: + counts[labels[i], j] += 1 else: # axis == 1 counts = np.zeros((n, max_bin), dtype='i8') with nogil: for i in range(n): for j in range(k): - counts[i, labels[j]] += mask[i, j] + if mask[i, j]: + counts[i, labels[j]] += 1 return counts @@ -1010,34 +1001,34 @@ cdef inline bint c_is_list_like(object obj, bint allow_sets): _TYPE_MAP = { - 'categorical': 'categorical', - 'category': 'categorical', - 'int8': 'integer', - 'int16': 'integer', - 'int32': 'integer', - 'int64': 'integer', - 'i': 'integer', - 'uint8': 'integer', - 'uint16': 'integer', - 'uint32': 'integer', - 'uint64': 'integer', - 'u': 'integer', - 'float32': 'floating', - 'float64': 'floating', - 'f': 'floating', - 'complex64': 'complex', - 'complex128': 'complex', - 'c': 'complex', - 'string': 'string', - 'S': 'bytes', - 'U': 'string', - 'bool': 'boolean', - 'b': 'boolean', - 'datetime64[ns]': 'datetime64', - 'M': 'datetime64', - 'timedelta64[ns]': 'timedelta64', - 'm': 'timedelta64', - 'interval': 'interval', + "categorical": "categorical", + "category": "categorical", + "int8": "integer", + "int16": "integer", + "int32": "integer", + "int64": "integer", + "i": "integer", + "uint8": "integer", + "uint16": "integer", + "uint32": "integer", + "uint64": "integer", + "u": "integer", + "float32": "floating", + "float64": "floating", + "f": "floating", + "complex64": "complex", + "complex128": "complex", + "c": "complex", + "string": "string", + "S": "bytes", + "U": "string", + "bool": "boolean", + "b": "boolean", + "datetime64[ns]": "datetime64", + "M": "datetime64", + "timedelta64[ns]": "timedelta64", + "m": "timedelta64", + "interval": "interval", } # types only exist on certain platform @@ -1080,29 +1071,29 @@ cdef class Seen: bint timedelta_ # seen_timedelta bint datetimetz_ # seen_datetimetz - def __cinit__(self, bint coerce_numeric=0): + def __cinit__(self, bint coerce_numeric=False): """ Initialize a Seen instance. Parameters ---------- - coerce_numeric : bint, default 0 + coerce_numeric : bool, default False Whether or not to force conversion to a numeric data type if initial methods to convert to numeric fail. """ - self.int_ = 0 - self.nat_ = 0 - self.bool_ = 0 - self.null_ = 0 - self.nan_ = 0 - self.uint_ = 0 - self.sint_ = 0 - self.float_ = 0 - self.object_ = 0 - self.complex_ = 0 - self.datetime_ = 0 - self.timedelta_ = 0 - self.datetimetz_ = 0 + self.int_ = False + self.nat_ = False + self.bool_ = False + self.null_ = False + self.nan_ = False + self.uint_ = False + self.sint_ = False + self.float_ = False + self.object_ = False + self.complex_ = False + self.datetime_ = False + self.timedelta_ = False + self.datetimetz_ = False self.coerce_numeric = coerce_numeric cdef inline bint check_uint64_conflict(self) except -1: @@ -1143,8 +1134,8 @@ cdef class Seen: """ Set flags indicating that a null value was encountered. """ - self.null_ = 1 - self.float_ = 1 + self.null_ = True + self.float_ = True cdef saw_int(self, object val): """ @@ -1163,7 +1154,7 @@ cdef class Seen: val : Python int Value with which to set the flags. """ - self.int_ = 1 + self.int_ = True self.sint_ = self.sint_ or (oINT64_MIN <= val < 0) self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) @@ -1182,12 +1173,13 @@ cdef class Seen: or self.nat_) -cdef _try_infer_map(v): +cdef object _try_infer_map(object v): """ If its in our map, just return the dtype. 
""" cdef: - object attr, val + object val + str attr for attr in ['name', 'kind', 'base']: val = getattr(v.dtype, attr) if val in _TYPE_MAP: @@ -1206,8 +1198,6 @@ def infer_dtype(value: object, skipna: bool = True) -> str: skipna : bool, default True Ignore NaN values when inferring the type. - .. versionadded:: 0.21.0 - Returns ------- str @@ -1461,9 +1451,9 @@ def infer_datetimelike_array(arr: object) -> object: """ cdef: Py_ssize_t i, n = len(arr) - bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0 - bint seen_tz_aware = 0, seen_tz_naive = 0 - bint seen_nat = 0 + bint seen_timedelta = False, seen_date = False, seen_datetime = False + bint seen_tz_aware = False, seen_tz_naive = False + bint seen_nat = False list objs = [] object v @@ -1479,27 +1469,27 @@ def infer_datetimelike_array(arr: object) -> object: # nan or None pass elif v is NaT: - seen_nat = 1 + seen_nat = True elif PyDateTime_Check(v): # datetime - seen_datetime = 1 + seen_datetime = True # disambiguate between tz-naive and tz-aware if v.tzinfo is None: - seen_tz_naive = 1 + seen_tz_naive = True else: - seen_tz_aware = 1 + seen_tz_aware = True if seen_tz_naive and seen_tz_aware: return 'mixed' elif util.is_datetime64_object(v): # np.datetime64 - seen_datetime = 1 + seen_datetime = True elif PyDate_Check(v): - seen_date = 1 + seen_date = True elif is_timedelta(v): # timedelta, or timedelta64 - seen_timedelta = 1 + seen_timedelta = True else: return "mixed" @@ -2026,8 +2016,6 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, except (TypeError, ValueError) as err: if not seen.coerce_numeric: raise type(err)(f"{err} at position {i}") - elif "uint64" in str(err): # Exception from check functions. - raise seen.saw_null() floats[i] = NaN @@ -2053,10 +2041,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, @cython.boundscheck(False) @cython.wraparound(False) -def maybe_convert_objects(ndarray[object] objects, bint try_float=0, - bint safe=0, bint convert_datetime=0, - bint convert_timedelta=0, - bint convert_to_nullable_integer=0): +def maybe_convert_objects(ndarray[object] objects, bint try_float=False, + bint safe=False, bint convert_datetime=False, + bint convert_timedelta=False, + bint convert_to_nullable_integer=False): """ Type inference function-- convert object array to proper dtype @@ -2077,7 +2065,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, If an array-like object contains only timedelta values or NaT is encountered, whether to convert and return an array of m8[ns] dtype. convert_to_nullable_integer : bool, default False - If an array-like object contains only interger values (and NaN) is + If an array-like object contains only integer values (and NaN) is encountered, whether to convert and return an IntegerArray. 
Returns @@ -2120,45 +2108,45 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, val = objects[i] if val is None: - seen.null_ = 1 + seen.null_ = True floats[i] = complexes[i] = fnan mask[i] = True elif val is NaT: - seen.nat_ = 1 + seen.nat_ = True if convert_datetime: idatetimes[i] = NPY_NAT if convert_timedelta: itimedeltas[i] = NPY_NAT if not (convert_datetime or convert_timedelta): - seen.object_ = 1 + seen.object_ = True break elif val is np.nan: - seen.nan_ = 1 + seen.nan_ = True mask[i] = True floats[i] = complexes[i] = val elif util.is_bool_object(val): - seen.bool_ = 1 + seen.bool_ = True bools[i] = val elif util.is_float_object(val): floats[i] = complexes[i] = val - seen.float_ = 1 + seen.float_ = True elif util.is_datetime64_object(val): if convert_datetime: idatetimes[i] = convert_to_tsobject( val, None, None, 0, 0).value - seen.datetime_ = 1 + seen.datetime_ = True else: - seen.object_ = 1 + seen.object_ = True break elif is_timedelta(val): if convert_timedelta: itimedeltas[i] = convert_to_timedelta64(val, 'ns') - seen.timedelta_ = 1 + seen.timedelta_ = True else: - seen.object_ = 1 + seen.object_ = True break elif util.is_integer_object(val): - seen.int_ = 1 + seen.int_ = True floats[i] = val complexes[i] = val if not seen.null_: @@ -2167,7 +2155,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if ((seen.uint_ and seen.sint_) or val > oUINT64_MAX or val < oINT64_MIN): - seen.object_ = 1 + seen.object_ = True break if seen.uint_: @@ -2180,32 +2168,32 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, elif util.is_complex_object(val): complexes[i] = val - seen.complex_ = 1 + seen.complex_ = True elif PyDateTime_Check(val) or util.is_datetime64_object(val): # if we have an tz's attached then return the objects if convert_datetime: if getattr(val, 'tzinfo', None) is not None: - seen.datetimetz_ = 1 + seen.datetimetz_ = True break else: - seen.datetime_ = 1 + seen.datetime_ = True idatetimes[i] = convert_to_tsobject( val, None, None, 0, 0).value else: - seen.object_ = 1 + seen.object_ = True break elif try_float and not isinstance(val, str): # this will convert Decimal objects try: floats[i] = float(val) complexes[i] = complex(val) - seen.float_ = 1 + seen.float_ = True except (ValueError, TypeError): - seen.object_ = 1 + seen.object_ = True break else: - seen.object_ = 1 + seen.object_ = True break # we try to coerce datetime w/tz but must all have the same tz @@ -2213,7 +2201,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if is_datetime_with_singletz_array(objects): from pandas import DatetimeIndex return DatetimeIndex(objects) - seen.object_ = 1 + seen.object_ = True if not seen.object_: if not safe: @@ -2300,7 +2288,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return uints else: return ints - elif seen.is_bool: + elif seen.is_bool and not seen.nan_: return bools.view(np.bool_) return objects @@ -2312,7 +2300,7 @@ no_default = object() #: Sentinel indicating the default value. @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, +def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True, object na_value=no_default, object dtype=object): """ Substitute for np.vectorize with pandas-friendly dtype inference. 
@@ -2361,16 +2349,16 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, if convert: return maybe_convert_objects(result, - try_float=0, - convert_datetime=0, - convert_timedelta=0) + try_float=False, + convert_datetime=False, + convert_timedelta=False) return result @cython.boundscheck(False) @cython.wraparound(False) -def map_infer(ndarray arr, object f, bint convert=1): +def map_infer(ndarray arr, object f, bint convert=True): """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2403,9 +2391,9 @@ def map_infer(ndarray arr, object f, bint convert=1): if convert: return maybe_convert_objects(result, - try_float=0, - convert_datetime=0, - convert_timedelta=0) + try_float=False, + convert_datetime=False, + convert_timedelta=False) return result diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index d4303ac28b9a5..5ab42a736712f 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from numpy cimport ndarray, uint8_t cpdef bint checknull(object val) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 4d17a6f883c1c..dacf454824190 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -10,10 +10,13 @@ cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.tslibs.np_datetime cimport ( - get_timedelta64_value, get_datetime64_value) + +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, c_NaT as NaT, is_null_datetimelike) + c_NaT as NaT, + checknull_with_nat, + is_null_datetimelike, +) from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas.compat import is_platform_32bit @@ -44,7 +47,7 @@ cpdef bint checknull(object val): Returns ------- - result : bool + bool Notes ----- @@ -223,7 +226,7 @@ def isnaobj2d_old(arr: ndarray) -> ndarray: Returns ------- - result : ndarray (dtype=np.bool_) + ndarray (dtype=np.bool_) Notes ----- @@ -248,17 +251,11 @@ def isnaobj2d_old(arr: ndarray) -> ndarray: def isposinf_scalar(val: object) -> bool: - if util.is_float_object(val) and val == INF: - return True - else: - return False + return util.is_float_object(val) and val == INF def isneginf_scalar(val: object) -> bool: - if util.is_float_object(val) and val == NEGINF: - return True - else: - return False + return util.is_float_object(val) and val == NEGINF cdef inline bint is_null_datetime64(v): @@ -364,6 +361,9 @@ class NAType(C_NAType): exponent = 31 if is_32bit else 61 return 2 ** exponent - 1 + def __reduce__(self): + return "NA" + # Binary arithmetic and comparison ops -> propagate __add__ = _create_binary_propagating_op("__add__") @@ -423,7 +423,6 @@ class NAType(C_NAType): return NA elif isinstance(other, np.ndarray): return np.where(other == 1, other, NA) - return NotImplemented # Logical ops using Kleene logic @@ -433,8 +432,7 @@ class NAType(C_NAType): return False elif other is True or other is C_NA: return NA - else: - return NotImplemented + return NotImplemented __rand__ = __and__ @@ -443,8 +441,7 @@ class NAType(C_NAType): return True elif other is False or other is C_NA: return NA - else: - return NotImplemented + return NotImplemented __ror__ = __or__ diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index abe1484e3763d..658600cdfbe6c 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -1,7 +1,15 @@ import operator -from cpython.object cimport (PyObject_RichCompareBool, - 
Py_EQ, Py_NE, Py_LT, Py_LE, Py_GT, Py_GE) +from cpython.object cimport ( + Py_EQ, + Py_GE, + Py_GT, + Py_LE, + Py_LT, + Py_NE, + PyObject_RichCompareBool, +) + import cython from cython import Py_ssize_t @@ -100,7 +108,7 @@ def scalar_compare(object[:] values, object val, object op): @cython.wraparound(False) @cython.boundscheck(False) -def vec_compare(object[:] left, object[:] right, object op): +def vec_compare(ndarray[object] left, ndarray[object] right, object op): """ Compare the elements of `left` with the elements of `right` pointwise, with the comparison operation described by `op`. diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3077f73a8d1a4..e4aeb7ad69792 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -34,6 +34,7 @@ cimport numpy as cnp from numpy cimport ndarray, uint8_t, uint64_t, int64_t, float64_t cnp.import_array() +cimport pandas._libs.util as util from pandas._libs.util cimport UINT64_MAX, INT64_MAX, INT64_MIN import pandas._libs.lib as lib @@ -241,9 +242,9 @@ cdef extern from "parser/io.h": void* buffer_mmap_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status) - void *new_file_source(char *fname, size_t buffer_size) + void *new_file_source(char *fname, size_t buffer_size) except NULL - void *new_rd_source(object obj) + void *new_rd_source(object obj) except NULL int del_file_source(void *src) int del_rd_source(void *src) @@ -267,7 +268,7 @@ cdef class TextReader: cdef: parser_t *parser - object file_handle, na_fvalues + object na_fvalues object true_values, false_values object handle bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns @@ -279,18 +280,16 @@ cdef class TextReader: cdef public: int64_t leading_cols, table_width, skipfooter, buffer_lines - object allow_leading_cols - object delimiter, converters, delim_whitespace + bint allow_leading_cols, mangle_dupe_cols, memory_map, low_memory + bint delim_whitespace + object delimiter, converters object na_values - object memory_map object header, orig_header, names, header_start, header_end object index_col - object low_memory object skiprows object dtype object encoding object compression - object mangle_dupe_cols object usecols list dtype_cast_order set unnamed_cols @@ -298,54 +297,44 @@ cdef class TextReader: def __cinit__(self, source, delimiter=b',', - header=0, header_start=0, header_end=0, index_col=None, names=None, - - memory_map=False, + bint memory_map=False, tokenize_chunksize=DEFAULT_CHUNKSIZE, - delim_whitespace=False, - + bint delim_whitespace=False, compression=None, - converters=None, - - skipinitialspace=False, + bint skipinitialspace=False, escapechar=None, - doublequote=True, + bint doublequote=True, quotechar=b'"', quoting=0, lineterminator=None, - encoding=None, - comment=None, decimal=b'.', thousands=None, - dtype=None, usecols=None, - error_bad_lines=True, - warn_bad_lines=True, - - na_filter=True, + bint error_bad_lines=True, + bint warn_bad_lines=True, + bint na_filter=True, na_values=None, na_fvalues=None, - keep_default_na=True, - + bint keep_default_na=True, true_values=None, false_values=None, - allow_leading_cols=True, - low_memory=False, + bint allow_leading_cols=True, + bint low_memory=False, skiprows=None, skipfooter=0, - verbose=False, - mangle_dupe_cols=True, + bint verbose=False, + bint mangle_dupe_cols=True, float_precision=None, - skip_blank_lines=True): + bint skip_blank_lines=True): # set encoding for native Python and C library if encoding is not None: @@ -591,7 +580,7 @@ cdef class TextReader: 
self.parser.quotechar = ord(quote_char) cdef _make_skiprow_set(self): - if isinstance(self.skiprows, (int, np.integer)): + if util.is_integer_object(self.skiprows): parser_set_skipfirstnrows(self.parser, self.skiprows) elif not callable(self.skiprows): for i in self.skiprows: @@ -601,7 +590,6 @@ cdef class TextReader: cdef _setup_parser_source(self, source): cdef: - int status void *ptr self.parser.cb_io = NULL @@ -638,7 +626,8 @@ cdef class TextReader: raise ValueError(f'Unrecognized compression type: ' f'{self.compression}') - if self.encoding and isinstance(source, (io.BufferedIOBase, io.RawIOBase)): + if (self.encoding and hasattr(source, "read") and + not hasattr(source, "encoding")): source = io.TextIOWrapper( source, self.encoding.decode('utf-8'), newline='') @@ -666,26 +655,12 @@ cdef class TextReader: ptr = new_file_source(source, self.parser.chunksize) self.parser.cb_io = &buffer_file_bytes self.parser.cb_cleanup = &del_file_source - - if ptr == NULL: - if not os.path.exists(source): - - raise FileNotFoundError( - ENOENT, - f'File {usource} does not exist', - usource) - raise IOError('Initializing from file failed') - self.parser.source = ptr elif hasattr(source, 'read'): # e.g., StringIO ptr = new_rd_source(source) - if ptr == NULL: - raise IOError('Initializing parser from file-like ' - 'object failed') - self.parser.source = ptr self.parser.cb_io = &buffer_rd_bytes self.parser.cb_cleanup = &del_rd_source @@ -697,16 +672,14 @@ cdef class TextReader: # header is now a list of lists, so field_count should use header[0] cdef: - Py_ssize_t i, start, field_count, passed_count, unnamed_count + Py_ssize_t i, start, field_count, passed_count, unnamed_count, level char *word object name, old_name - int status - uint64_t hr, data_line + uint64_t hr, data_line = 0 char *errors = "strict" StringPath path = _string_path(self.c_encoding) - - header = [] - unnamed_cols = set() + list header = [] + set unnamed_cols = set() if self.parser.header_start >= 0: @@ -805,7 +778,6 @@ cdef class TextReader: self._tokenize_rows(1) header = [ self.names ] - data_line = 0 if self.parser.lines < 1: field_count = len(header[0]) @@ -851,9 +823,6 @@ cdef class TextReader: """ rows=None --> read all rows """ - cdef: - int status - if self.low_memory: # Conserve intermediate space columns = self._read_low_memory(rows) @@ -866,7 +835,7 @@ cdef class TextReader: cdef _read_low_memory(self, rows): cdef: size_t rows_read = 0 - chunks = [] + list chunks = [] if rows is None: while True: @@ -902,7 +871,9 @@ cdef class TextReader: return _concatenate_chunks(chunks) cdef _tokenize_rows(self, size_t nrows): - cdef int status + cdef: + int status + with nogil: status = tokenize_nrows(self.parser, nrows) @@ -1330,8 +1301,8 @@ cdef class TextReader: else: if self.header is not None: j = i - self.leading_cols - # hack for #2442 - if j == len(self.header[0]): + # generate extra (bogus) headers if there are more columns than headers + if j >= len(self.header[0]): return j else: return self.header[0][j] @@ -1345,7 +1316,8 @@ cdef: def _ensure_encoded(list lst): - cdef list result = [] + cdef: + list result = [] for x in lst: if isinstance(x, str): x = PyUnicode_AsUTF8String(x) @@ -1472,7 +1444,7 @@ cdef _string_box_decode(parser_t *parser, int64_t col, bint na_filter, kh_str_starts_t *na_hashset, char *encoding): cdef: - int error, na_count = 0 + int na_count = 0 Py_ssize_t i, size, lines coliter_t it const char *word = NULL @@ -1531,7 +1503,7 @@ cdef _categorical_convert(parser_t *parser, int64_t col, char *encoding): 
"Convert column data into codes, categories" cdef: - int error, na_count = 0 + int na_count = 0 Py_ssize_t i, size, lines coliter_t it const char *word = NULL @@ -1595,9 +1567,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col, cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, int64_t width): cdef: - Py_ssize_t i - coliter_t it - const char *word = NULL char *data ndarray result @@ -1641,15 +1610,11 @@ cdef _try_double(parser_t *parser, int64_t col, bint na_filter, kh_str_starts_t *na_hashset, object na_flist): cdef: int error, na_count = 0 - Py_ssize_t i, lines - coliter_t it - const char *word = NULL - char *p_end + Py_ssize_t lines float64_t *data float64_t NA = na_values[np.float64] kh_float64_t *na_fset ndarray result - khiter_t k bint use_na_flist = len(na_flist) > 0 lines = line_end - line_start @@ -1684,7 +1649,7 @@ cdef inline int _try_double_nogil(parser_t *parser, coliter_t it const char *word = NULL char *p_end - khiter_t k, k64 + khiter_t k64 na_count[0] = 0 coliter_setup(&it, parser, col, line_start) @@ -1747,11 +1712,10 @@ cdef _try_uint64(parser_t *parser, int64_t col, bint na_filter, kh_str_starts_t *na_hashset): cdef: int error - Py_ssize_t i, lines + Py_ssize_t lines coliter_t it uint64_t *data ndarray result - khiter_t k uint_state state lines = line_end - line_start @@ -1788,7 +1752,6 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k coliter_setup(&it, parser, col, line_start) @@ -1821,13 +1784,11 @@ cdef _try_int64(parser_t *parser, int64_t col, bint na_filter, kh_str_starts_t *na_hashset): cdef: int error, na_count = 0 - Py_ssize_t i, lines + Py_ssize_t lines coliter_t it int64_t *data ndarray result - int64_t NA = na_values[np.int64] - khiter_t k lines = line_end - line_start result = np.empty(lines, dtype=np.int64) @@ -1855,7 +1816,6 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k na_count[0] = 0 coliter_setup(&it, parser, col, line_start) @@ -1891,14 +1851,10 @@ cdef _try_bool_flex(parser_t *parser, int64_t col, const kh_str_starts_t *false_hashset): cdef: int error, na_count = 0 - Py_ssize_t i, lines - coliter_t it - const char *word = NULL + Py_ssize_t lines uint8_t *data ndarray result - uint8_t NA = na_values[np.bool_] - khiter_t k lines = line_end - line_start result = np.empty(lines, dtype=np.uint8) @@ -1925,7 +1881,6 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL - khiter_t k na_count[0] = 0 coliter_setup(&it, parser, col, line_start) @@ -1980,10 +1935,8 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: Py_ssize_t i - khiter_t k kh_str_starts_t *table int ret = 0 - object val table = kh_init_str_starts() @@ -2011,7 +1964,6 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL: cdef kh_float64_t* kset_float64_from_list(values) except NULL: # caller takes responsibility for freeing the hash table cdef: - Py_ssize_t i khiter_t k kh_float64_t *table int ret = 0 @@ -2074,12 +2026,11 @@ def _concatenate_chunks(list chunks): cdef: list names = list(chunks[0].keys()) object name - list warning_columns + list warning_columns = [] object warning_names object common_type result = {} - warning_columns = list() 
for name in names: arrs = [chunk.pop(name) for chunk in chunks] # Check each arr for consistent types. @@ -2149,7 +2100,6 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, char* c_encoding): cdef: - int error Py_ssize_t i, lines coliter_t it const char *word = NULL @@ -2184,7 +2134,7 @@ def _maybe_encode(values): def sanitize_objects(ndarray[object] values, set na_values, - convert_empty=True): + bint convert_empty=True): """ Convert specified values, including the given set na_values and empty strings if convert_empty is True, to np.nan. @@ -2193,7 +2143,7 @@ def sanitize_objects(ndarray[object] values, set na_values, ---------- values : ndarray[object] na_values : set - convert_empty : bool (default True) + convert_empty : bool, default True """ cdef: Py_ssize_t i, n diff --git a/pandas/_libs/properties.pyx b/pandas/_libs/properties.pyx index 857119789ab45..9b936eed785b4 100644 --- a/pandas/_libs/properties.pyx +++ b/pandas/_libs/properties.pyx @@ -1,7 +1,6 @@ from cython import Py_ssize_t -from cpython.dict cimport ( - PyDict_Contains, PyDict_GetItem, PyDict_SetItem) +from cpython.dict cimport PyDict_Contains, PyDict_GetItem, PyDict_SetItem cdef class CachedProperty: @@ -57,10 +56,10 @@ cdef class AxisProperty: list axes if obj is None: - # Only instances have _data, not classes + # Only instances have _mgr, not classes return self else: - axes = obj._data.axes + axes = obj._mgr.axes return axes[self.axis] def __set__(self, obj, value): diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index b27072aa66708..a7b2d5d5491d5 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,5 +1,4 @@ from copy import copy -from distutils.version import LooseVersion from cython import Py_ssize_t from cpython.ref cimport Py_INCREF @@ -37,7 +36,12 @@ cdef class Reducer: object dummy, f, labels, typ, ityp, index ndarray arr - def __init__(self, ndarray arr, object f, axis=1, dummy=None, labels=None): + def __init__( + self, ndarray arr, object f, int axis=1, object dummy=None, object labels=None + ): + cdef: + Py_ssize_t n, k + n, k = (arr).shape if axis == 0: @@ -61,7 +65,7 @@ cdef class Reducer: self.dummy, self.typ, self.index, self.ityp = self._check_dummy( dummy=dummy) - cdef _check_dummy(self, dummy=None): + cdef _check_dummy(self, object dummy=None): cdef: object index = None, typ = None, ityp = None @@ -123,7 +127,7 @@ cdef class Reducer: name = labels[i] object.__setattr__( - cached_typ._data._block, 'values', chunk) + cached_typ._mgr._block, 'values', chunk) object.__setattr__(cached_typ, 'name', name) res = self.f(cached_typ) else: @@ -148,7 +152,7 @@ cdef class Reducer: cdef class _BaseGrouper: - cdef _check_dummy(self, dummy): + cdef _check_dummy(self, object dummy): # both values and index must be an ndarray! values = dummy.values @@ -176,7 +180,9 @@ cdef class _BaseGrouper: # to a 1-d ndarray like datetime / timedelta / period. object.__setattr__(cached_ityp, '_index_data', islider.buf) cached_ityp._engine.clear_mapping() - object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__(cached_typ._mgr._block, 'values', vslider.buf) + object.__setattr__(cached_typ._mgr._block, 'mgr_locs', + slice(len(vslider.buf))) object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', self.name) @@ -189,13 +195,16 @@ cdef class _BaseGrouper: """ Call self.f on our new group, then update to the next group. 
""" + cdef: + object res + cached_ityp._engine.clear_mapping() res = self.f(cached_typ) res = _extract_result(res) if not initialized: # On the first pass, we check the output shape to see # if this looks like a reduction. - initialized = 1 + initialized = True _check_result_array(res, len(self.dummy_arr)) islider.advance(group_size) @@ -533,12 +542,16 @@ cdef class BlockSlider: cdef: char **base_ptrs - def __init__(self, frame): + def __init__(self, object frame): + cdef: + Py_ssize_t i + object b + self.frame = frame self.dummy = frame[:0] self.index = self.dummy.index - self.blocks = [b.values for b in self.dummy._data.blocks] + self.blocks = [b.values for b in self.dummy._mgr.blocks] for x in self.blocks: util.set_array_not_contiguous(x) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index e74b5919a4590..aed5e1d612088 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -36,7 +36,7 @@ ctypedef fused reshape_t: @cython.wraparound(False) @cython.boundscheck(False) -def unstack(reshape_t[:, :] values, uint8_t[:] mask, +def unstack(reshape_t[:, :] values, const uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, reshape_t[:, :] new_values, uint8_t[:, :] new_mask): """ diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 091ca42cb71dd..d853ddf3de7d4 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -34,18 +34,21 @@ cdef class IntIndex(SparseIndex): length : integer indices : array-like Contains integers corresponding to the indices. + check_integrity : bool, default=True + Check integrity of the input. """ cdef readonly: Py_ssize_t length, npoints ndarray indices - def __init__(self, Py_ssize_t length, indices): + def __init__(self, Py_ssize_t length, indices, bint check_integrity=True): self.length = length self.indices = np.ascontiguousarray(indices, dtype=np.int32) self.npoints = len(self.indices) - self.check_integrity() + if check_integrity: + self.check_integrity() def __reduce__(self): args = (self.length, self.indices) diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 7fbe7a04d5b22..2ada0a4bd173d 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -34,6 +34,9 @@ int floatify(PyObject *str, double *result, int *maybe_int) { data = PyBytes_AS_STRING(str); } else if (PyUnicode_Check(str)) { tmp = PyUnicode_AsUTF8String(str); + if (tmp == NULL) { + return -1; + } data = PyBytes_AS_STRING(tmp); } else { PyErr_SetString(PyExc_TypeError, "Invalid object type"); diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 1e3295fcb6fc7..51504527de5a2 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -28,6 +28,7 @@ The full license is in the LICENSE file, distributed with this software. 
void *new_file_source(char *fname, size_t buffer_size) { file_source *fs = (file_source *)malloc(sizeof(file_source)); if (fs == NULL) { + PyErr_NoMemory(); return NULL; } @@ -41,17 +42,20 @@ void *new_file_source(char *fname, size_t buffer_size) { int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); if (required == 0) { free(fs); + PyErr_SetFromWindowsErr(0); return NULL; } wname = (wchar_t*)malloc(required * sizeof(wchar_t)); if (wname == NULL) { free(fs); + PyErr_NoMemory(); return NULL; } if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) < required) { free(wname); free(fs); + PyErr_SetFromWindowsErr(0); return NULL; } fs->fd = _wopen(wname, O_RDONLY | O_BINARY); @@ -62,6 +66,7 @@ void *new_file_source(char *fname, size_t buffer_size) { #endif if (fs->fd == -1) { free(fs); + PyErr_SetFromErrnoWithFilename(PyExc_OSError, fname); return NULL; } @@ -71,6 +76,7 @@ void *new_file_source(char *fname, size_t buffer_size) { if (fs->buffer == NULL) { close(fs->fd); free(fs); + PyErr_NoMemory(); return NULL; } @@ -83,6 +89,10 @@ void *new_file_source(char *fname, size_t buffer_size) { void *new_rd_source(PyObject *obj) { rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); + if (rds == NULL) { + PyErr_NoMemory(); + return NULL; + } /* hold on to this object */ Py_INCREF(obj); rds->obj = obj; @@ -220,20 +230,15 @@ void *new_mmap(char *fname) { mm = (memory_map *)malloc(sizeof(memory_map)); if (mm == NULL) { - fprintf(stderr, "new_file_buffer: malloc() failed.\n"); - return (NULL); + return NULL; } mm->fd = open(fname, O_RDONLY | O_BINARY); if (mm->fd == -1) { - fprintf(stderr, "new_file_buffer: open(%s) failed. errno =%d\n", - fname, errno); free(mm); return NULL; } if (fstat(mm->fd, &stat) == -1) { - fprintf(stderr, "new_file_buffer: fstat() failed. errno =%d\n", - errno); close(mm->fd); free(mm); return NULL; @@ -242,8 +247,6 @@ void *new_mmap(char *fname) { mm->memmap = mmap(NULL, filesize, PROT_READ, MAP_SHARED, mm->fd, 0); if (mm->memmap == MAP_FAILED) { - /* XXX Eventually remove this print statement. */ - fprintf(stderr, "new_file_buffer: mmap() failed.\n"); close(mm->fd); free(mm); return NULL; diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2188ff6b0d464..7ba1a6cd398c9 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1189,8 +1189,14 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* cannot guarantee that nrows + 1 has been observed */ word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; - char_count = (self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1); + if (word_deletions >= 1) { + char_count = (self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1); + } else { + /* if word_deletions == 0 (i.e. this case) then char_count must + * be 0 too, as no data needs to be skipped */ + char_count = 0; + } TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, char_count)); diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index fc4bdef8463af..4c25ab572bebe 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -67,7 +67,7 @@ npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { } /* Convert PyDatetime To ISO C-string. 
mutates len */ -char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, +char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len) { npy_datetimestruct dts; int ret; @@ -98,7 +98,7 @@ char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, return result; } -npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) { +npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) { npy_datetimestruct dts; int ret; @@ -116,3 +116,29 @@ npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) { npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); return NpyDateTimeToEpoch(npy_dt, base); } + +/* Converts the int64_t representation of a duration to ISO; mutates len */ +char *int64ToIsoDuration(int64_t value, size_t *len) { + pandas_timedeltastruct tds; + int ret_code; + + pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); + + // Max theoretical length of ISO Duration with 64 bit day + // as the largest unit is 70 characters + 1 for a null terminator + char *result = PyObject_Malloc(71); + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + + ret_code = make_iso_8601_timedelta(&tds, result, len); + if (ret_code == -1) { + PyErr_SetString(PyExc_ValueError, + "Could not convert timedelta value to string"); + PyObject_Free(result); + return NULL; + } + + return result; +} diff --git a/pandas/_libs/src/ujson/python/date_conversions.h b/pandas/_libs/src/ujson/python/date_conversions.h index 45455f4d6128b..23e36999be43f 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.h +++ b/pandas/_libs/src/ujson/python/date_conversions.h @@ -4,7 +4,6 @@ #define PY_SSIZE_T_CLEAN #include #include -#include "datetime.h" // Scales value inplace from nanosecond resolution to unit resolution int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit); @@ -23,9 +22,11 @@ npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base); // up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z // while base="ns" yields "2020-01-01T00:00:00.000000000Z" // len is mutated to save the length of the returned string -char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len); +char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len); // Convert a Python Date/Datetime to Unix epoch with resolution base -npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base); +npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base); + +char *int64ToIsoDuration(int64_t value, size_t *len); #endif diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 8cfc20ffd2c1c..0eae7a36a29c3 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -165,7 +165,6 @@ void *initObjToJSON(void) { cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); - cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } @@ -222,28 +221,19 @@ static PyObject *get_values(PyObject *obj) { PRINTMARK(); - if (PyObject_HasAttrString(obj, "_internal_get_values")) { + if (PyObject_TypeCheck(obj, cls_index) || PyObject_TypeCheck(obj, cls_series)) { + // The special cases to worry about are dt64tz and category[dt64tz]. + // In both cases we want the UTC-localized datetime64 ndarray, + // without going through and object array of Timestamps. 
PRINTMARK(); - values = PyObject_CallMethod(obj, "_internal_get_values", NULL); - - if (values == NULL) { - // Clear so we can subsequently try another method - PyErr_Clear(); - } else if (!PyArray_CheckExact(values)) { - // Didn't get a numpy array, so keep trying - PRINTMARK(); - Py_DECREF(values); - values = NULL; - } - } - - if ((values == NULL) && PyObject_HasAttrString(obj, "get_block_values")) { - PRINTMARK(); - values = PyObject_CallMethod(obj, "get_block_values", NULL); + values = PyObject_GetAttrString(obj, "values"); if (values == NULL) { // Clear so we can subsequently try another method PyErr_Clear(); + } else if (PyObject_HasAttrString(values, "__array__")) { + // We may have gotten a Categorical or Sparse array so call np.array + values = PyObject_CallMethod(values, "__array__", NULL); } else if (!PyArray_CheckExact(values)) { // Didn't get a numpy array, so keep trying PRINTMARK(); @@ -289,7 +279,7 @@ static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { } static int is_simple_frame(PyObject *obj) { - PyObject *check = get_sub_attr(obj, "_data", "is_mixed_type"); + PyObject *check = get_sub_attr(obj, "_mgr", "is_mixed_type"); int ret = (check == Py_False); if (!check) { @@ -366,6 +356,12 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), return int64ToIso(GET_TC(tc)->longValue, base, len); } +/* JSON callback. returns a char* and mutates the pointer to *len */ +static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { + return int64ToIsoDuration(GET_TC(tc)->longValue, len); +} + /* JSON callback */ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, size_t *len) { @@ -764,7 +760,7 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { goto BLKRET; } - blocks = get_sub_attr(obj, "_data", "blocks"); + blocks = get_sub_attr(obj, "_mgr", "blocks"); if (!blocks) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; goto BLKRET; @@ -780,7 +776,7 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { goto BLKRET; } - tmp = get_values(block); + tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL); if (!tmp) { ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; Py_DECREF(block); @@ -1266,7 +1262,7 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { } else if (index == 2) { memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); if (is_simple_frame(obj)) { - GET_TC(tc)->itemValue = get_values(obj); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); if (!GET_TC(tc)->itemValue) { return 0; } @@ -1454,7 +1450,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, 1000000000LL; // nanoseconds per second } else { // datetime.* objects don't follow above rules - nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); + nanosecVal = + PyDateTimeToEpoch(item, NPY_FR_ns); } } } @@ -1466,37 +1463,13 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, strncpy(cLabel, "null", len); } else { if (enc->datetimeIso) { - // TODO: Vectorized Timedelta function if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { - PyObject *td = - PyObject_CallFunction(cls_timedelta, "(O)", item); - if (td == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - PyObject *iso = - PyObject_CallMethod(td, "isoformat", NULL); - Py_DECREF(td); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - len = strlen(PyUnicode_AsUTF8(iso)); - cLabel = PyObject_Malloc(len + 1); - 
memcpy(cLabel, PyUnicode_AsUTF8(iso), len + 1); - Py_DECREF(iso); + cLabel = int64ToIsoDuration(nanosecVal, &len); } else { if (type_num == NPY_DATETIME) { cLabel = int64ToIso(nanosecVal, base, &len); } else { - cLabel = PyDateTimeToIso((PyDateTime_Date *)item, - base, &len); + cLabel = PyDateTimeToIso(item, base, &len); } } if (cLabel == NULL) { @@ -1623,7 +1596,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (enc->datetimeIso) { PRINTMARK(); - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + if (enc->npyType == NPY_TIMEDELTA) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + } else { + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + } // Currently no way to pass longVal to iso function, so use // state management GET_TC(tc)->longValue = longVal; @@ -1704,7 +1681,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + GET_TC(tc)->longValue = + PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; @@ -1730,7 +1708,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + GET_TC(tc)->longValue = + PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; @@ -1743,28 +1722,30 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } - unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&value, unit) != 0) { - // TODO: Add some kind of error handling here - } - - exc = PyErr_Occurred(); - - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - PRINTMARK(); - goto INVALID; - } - + PRINTMARK(); if (value == get_nat()) { PRINTMARK(); tc->type = JT_NULL; return; - } + } else if (enc->datetimeIso) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + tc->type = JT_UTF8; + } else { + unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + if (scaleNanosecToUnit(&value, unit) != 0) { + // TODO: Add some kind of error handling here + } - GET_TC(tc)->longValue = value; + exc = PyErr_Occurred(); - PRINTMARK(); - tc->type = JT_LONG; + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + PRINTMARK(); + goto INVALID; + } + + tc->type = JT_LONG; + } + GET_TC(tc)->longValue = value; return; } else if (PyArray_IsScalar(obj, Integer)) { PRINTMARK(); @@ -1935,7 +1916,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterNext = NpyArr_iterNext; pc->iterGetName = NpyArr_iterGetName; - pc->newObj = get_values(obj); + pc->newObj = PyObject_GetAttrString(obj, "values"); if (!pc->newObj) { goto INVALID; } diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 0e57b563d4d25..c6b8c3e876390 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -65,7 +65,7 @@ cpdef assert_dict_equal(a, b, bint compare_keys=True): cpdef assert_almost_equal(a, b, check_less_precise=False, bint check_dtype=True, - obj=None, lobj=None, robj=None): + obj=None, lobj=None, robj=None, index_values=None): """ Check that left and right objects are almost equal. 
@@ -89,6 +89,12 @@ cpdef assert_almost_equal(a, b, robj : str, default None Specify right object name being compared, internally used to show appropriate assertion message + index_values : ndarray, default None + Specify shared index values of objects being compared, internally used + to show appropriate assertion message + + .. versionadded:: 1.1.0 + """ cdef: int decimal @@ -171,7 +177,7 @@ cpdef assert_almost_equal(a, b, from pandas._testing import raise_assert_detail msg = (f"{obj} values are different " f"({np.round(diff * 100.0 / na, 5)} %)") - raise_assert_detail(obj, msg, lobj, robj) + raise_assert_detail(obj, msg, lobj, robj, index_values=index_values) return True diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index a176c4e41e834..53bcf5be2586a 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,27 +1,44 @@ import cython -from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, - PyDateTime_IMPORT, - timedelta, datetime, date, time) +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + date, + datetime, + time, + timedelta, +) # import datetime C API PyDateTime_IMPORT cimport numpy as cnp -from numpy cimport int64_t, ndarray, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t import numpy as np cnp.import_array() import pytz from pandas._libs.util cimport ( - is_integer_object, is_float_object, is_datetime64_object) + is_datetime64_object, + is_float_object, + is_integer_object, +) from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, _string_to_dts, dt64_to_dtstruct, - dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64, get_datetime64_value) + _string_to_dts, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + get_datetime64_value, + npy_datetimestruct, + pydate_to_dt64, + pydatetime_to_dt64, +) + from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import parse_datetime_string @@ -44,45 +61,71 @@ from pandas._libs.tslibs.timestamps cimport create_timestamp_from_ts from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.tzconversion cimport ( - tz_convert_single, tz_convert_utc_to_tzlocal) + tz_convert_single, + tz_convert_utc_to_tzlocal, +) cdef inline object create_datetime_from_ts( - int64_t value, npy_datetimestruct dts, - object tz, object freq, bint fold): - """ convenience routine to construct a datetime.datetime from its parts """ - return datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz, fold=fold) + int64_t value, + npy_datetimestruct dts, + object tz, + object freq, + bint fold +): + """ + Convenience routine to construct a datetime.datetime from its parts. + """ + return datetime( + dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold + ) cdef inline object create_date_from_ts( - int64_t value, npy_datetimestruct dts, - object tz, object freq, bint fold): - """ convenience routine to construct a datetime.date from its parts """ + int64_t value, + npy_datetimestruct dts, + object tz, + object freq, + bint fold +): + """ + Convenience routine to construct a datetime.date from its parts. 
+ """ # GH 25057 add fold argument to match other func_create signatures return date(dts.year, dts.month, dts.day) cdef inline object create_time_from_ts( - int64_t value, npy_datetimestruct dts, - object tz, object freq, bint fold): - """ convenience routine to construct a datetime.time from its parts """ + int64_t value, + npy_datetimestruct dts, + object tz, + object freq, + bint fold +): + """ + Convenience routine to construct a datetime.time from its parts. + """ return time(dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold) @cython.wraparound(False) @cython.boundscheck(False) -def ints_to_pydatetime(const int64_t[:] arr, object tz=None, object freq=None, - bint fold=0, str box="datetime"): +def ints_to_pydatetime( + const int64_t[:] arr, + object tz=None, + object freq=None, + bint fold=False, + str box="datetime" +): """ - Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp + Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp. Parameters ---------- - arr : array of i8 - tz : str, default None + arr : array of i8 + tz : str, optional convert to this timezone - freq : str/Offset, default None + freq : str/Offset, optional freq to convert fold : bint, default is 0 Due to daylight saving time, one wall clock time can occur twice @@ -91,17 +134,16 @@ def ints_to_pydatetime(const int64_t[:] arr, object tz=None, object freq=None, the wall clock hits the ambiguous time .. versionadded:: 1.1.0 - box : {'datetime', 'timestamp', 'date', 'time'}, default 'datetime' - If datetime, convert to datetime.datetime - If date, convert to datetime.date - If time, convert to datetime.time - If Timestamp, convert to pandas.Timestamp + box : {'datetime', 'timestamp', 'date', 'time'}, default 'datetime' + * If datetime, convert to datetime.datetime + * If date, convert to datetime.date + * If time, convert to datetime.time + * If Timestamp, convert to pandas.Timestamp Returns ------- - result : array of dtype specified by box + ndarray of dtype specified by box """ - cdef: Py_ssize_t i, n = len(arr) ndarray[int64_t] trans @@ -224,8 +266,12 @@ def _test_parse_iso8601(ts: str): @cython.wraparound(False) @cython.boundscheck(False) -def format_array_from_datetime(ndarray[int64_t] values, object tz=None, - object format=None, object na_rep=None): +def format_array_from_datetime( + ndarray[int64_t] values, + object tz=None, + object format=None, + object na_rep=None +): """ return a np object array of the string formatted values @@ -242,7 +288,8 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, cdef: int64_t val, ns, N = len(values) ndarray[int64_t] consider_values - bint show_ms = 0, show_us = 0, show_ns = 0, basic_format = 0 + bint show_ms = False, show_us = False, show_ns = False + bint basic_format = False ndarray[object] result = np.empty(N, dtype=object) object ts, res npy_datetimestruct dts @@ -303,8 +350,11 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, return result -def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, - str errors='coerce'): +def array_with_unit_to_datetime( + ndarray values, + object unit, + str errors='coerce' +): """ Convert the ndarray to datetime according to the time unit. 
@@ -322,14 +372,11 @@ def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, Parameters ---------- values : ndarray of object - Date-like objects to convert - mask : ndarray of bool - Not-a-time mask for non-nullable integer types conversion, - can be None + Date-like objects to convert. unit : object - Time unit to use during conversion + Time unit to use during conversion. errors : str, default 'raise' - Error behavior when parsing + Error behavior when parsing. Returns ------- @@ -346,6 +393,7 @@ def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, bint need_to_iterate = True ndarray[int64_t] iresult ndarray[object] oresult + ndarray mask object tz = None assert is_ignore or is_coerce or is_raise @@ -355,9 +403,6 @@ def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, result = values.astype('M8[ns]') else: result, tz = array_to_datetime(values.astype(object), errors=errors) - if mask is not None: - iresult = result.view('i8') - iresult[mask] = NPY_NAT return result, tz m = cast_from_unit(None, unit) @@ -370,9 +415,8 @@ def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, if values.dtype.kind == "i": # Note: this condition makes the casting="same_kind" redundant iresult = values.astype('i8', casting='same_kind', copy=False) - # If no mask, fill mask by comparing to NPY_NAT constant - if mask is None: - mask = iresult == NPY_NAT + # fill by comparing to NPY_NAT constant + mask = iresult == NPY_NAT iresult[mask] = 0 fvalues = iresult.astype('f8') * m need_to_iterate = False @@ -382,8 +426,7 @@ def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): - raise OutOfBoundsDatetime(f"cannot convert input with unit " - f"'{unit}'") + raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") result = (iresult * m).astype('M8[ns]') iresult = result.view('i8') iresult[mask] = NPY_NAT @@ -409,8 +452,8 @@ def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, except OverflowError: if is_raise: raise OutOfBoundsDatetime( - f"cannot convert input {val} with the unit " - f"'{unit}'") + f"cannot convert input {val} with the unit '{unit}'" + ) elif is_ignore: raise AssertionError iresult[i] = NPY_NAT @@ -425,16 +468,16 @@ def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, except ValueError: if is_raise: raise ValueError( - f"non convertible value {val} with the unit " - f"'{unit}'") + f"non convertible value {val} with the unit '{unit}'" + ) elif is_ignore: raise AssertionError iresult[i] = NPY_NAT except OverflowError: if is_raise: raise OutOfBoundsDatetime( - f"cannot convert input {val} with the unit " - f"'{unit}'") + f"cannot convert input {val} with the unit '{unit}'" + ) elif is_ignore: raise AssertionError iresult[i] = NPY_NAT @@ -442,8 +485,9 @@ def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, else: if is_raise: - raise ValueError(f"unit='{unit}' not valid with non-numerical " - f"val='{val}'") + raise ValueError( + f"unit='{unit}' not valid with non-numerical val='{val}'" + ) if is_ignore: raise AssertionError @@ -486,9 +530,14 @@ def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, @cython.wraparound(False) @cython.boundscheck(False) -cpdef array_to_datetime(ndarray[object] values, str errors='raise', - bint dayfirst=False, bint yearfirst=False, - object utc=None, bint require_iso8601=False): +cpdef 
array_to_datetime( + ndarray[object] values, + str errors='raise', + bint dayfirst=False, + bint yearfirst=False, + object utc=None, + bint require_iso8601=False +): """ Converts a 1D array of date-like values to a numpy array of either: 1) datetime64[ns] data @@ -528,10 +577,10 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', ndarray[object] oresult npy_datetimestruct dts bint utc_convert = bool(utc) - bint seen_integer = 0 - bint seen_string = 0 - bint seen_datetime = 0 - bint seen_datetime_offset = 0 + bint seen_integer = False + bint seen_string = False + bint seen_datetime = False + bint seen_datetime_offset = False bint is_raise = errors=='raise' bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' @@ -558,7 +607,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', iresult[i] = NPY_NAT elif PyDateTime_Check(val): - seen_datetime = 1 + seen_datetime = True if val.tzinfo is not None: if utc_convert: _ts = convert_datetime_to_tsobject(val, None) @@ -574,17 +623,17 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', check_dts_bounds(&dts) elif PyDate_Check(val): - seen_datetime = 1 + seen_datetime = True iresult[i] = pydate_to_dt64(val, &dts) check_dts_bounds(&dts) elif is_datetime64_object(val): - seen_datetime = 1 + seen_datetime = True iresult[i] = get_datetime64_nanos(val) elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition - seen_integer = 1 + seen_integer = True if val != val or val == NPY_NAT: iresult[i] = NPY_NAT @@ -603,7 +652,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', elif isinstance(val, str): # string - seen_string = 1 + seen_string = True if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT @@ -625,8 +674,9 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', iresult[i] = NPY_NAT continue elif is_raise: - raise ValueError(f"time data {val} doesn't " - f"match format specified") + raise ValueError( + f"time data {val} doesn't match format specified" + ) return values, tz_out try: @@ -641,11 +691,10 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', if is_coerce: iresult[i] = NPY_NAT continue - raise TypeError("invalid string coercion to " - "datetime") + raise TypeError("invalid string coercion to datetime") if tz is not None: - seen_datetime_offset = 1 + seen_datetime_offset = True # dateutil timezone objects cannot be hashed, so # store the UTC offsets in seconds instead out_tzoffset_vals.add(tz.total_seconds()) @@ -661,7 +710,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', # where we left off value = dtstruct_to_dt64(&dts) if out_local == 1: - seen_datetime_offset = 1 + seen_datetime_offset = True # Store the out_tzoffset in seconds # since we store the total_seconds of # dateutil.tz.tzoffset objects @@ -708,8 +757,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', return ignore_errors_out_of_bounds_fallback(values), tz_out except TypeError: - return array_to_datetime_object(values, errors, - dayfirst, yearfirst) + return array_to_datetime_object(values, errors, dayfirst, yearfirst) if seen_datetime and seen_integer: # we have mixed datetimes & integers @@ -724,8 +772,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', elif is_raise: raise ValueError("mixed datetimes and integers in passed array") else: - return array_to_datetime_object(values, errors, - dayfirst, yearfirst) + return array_to_datetime_object(values, 
errors, dayfirst, yearfirst) if seen_datetime_offset and not utc_convert: # GH#17697 @@ -736,8 +783,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', # (with individual dateutil.tzoffsets) are returned is_same_offsets = len(out_tzoffset_vals) == 1 if not is_same_offsets: - return array_to_datetime_object(values, errors, - dayfirst, yearfirst) + return array_to_datetime_object(values, errors, dayfirst, yearfirst) else: tz_offset = out_tzoffset_vals.pop() tz_out = pytz.FixedOffset(tz_offset / 60.) @@ -784,8 +830,12 @@ cdef ignore_errors_out_of_bounds_fallback(ndarray[object] values): @cython.wraparound(False) @cython.boundscheck(False) -cdef array_to_datetime_object(ndarray[object] values, str errors, - bint dayfirst=False, bint yearfirst=False): +cdef array_to_datetime_object( + ndarray[object] values, + str errors, + bint dayfirst=False, + bint yearfirst=False +): """ Fall back function for array_to_datetime diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 8d3b00e4a44b9..4a4e53eaa45fa 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,4 +1,21 @@ -# flake8: noqa +__all__ = [ + "localize_pydatetime", + "normalize_date", + "NaT", + "NaTType", + "iNaT", + "is_null_datetimelike", + "OutOfBoundsDatetime", + "IncompatibleFrequency", + "Period", + "Timedelta", + "delta_to_nanoseconds", + "ints_to_pytimedelta", + "Timestamp", + "tz_convert_single", + "NullFrequencyError", +] + from .conversion import localize_pydatetime, normalize_date from .nattype import NaT, NaTType, iNaT, is_null_datetimelike diff --git a/pandas/_libs/tslibs/c_timestamp.pxd b/pandas/_libs/tslibs/c_timestamp.pxd index e41197d0f20a2..d095b6027d2f9 100644 --- a/pandas/_libs/tslibs/c_timestamp.pxd +++ b/pandas/_libs/tslibs/c_timestamp.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from cpython.datetime cimport datetime from numpy cimport int64_t diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 2c72cec18f096..68987030e8b4e 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -114,6 +114,18 @@ cdef class _Timestamp(datetime): return NotImplemented elif is_array(other): # avoid recursion error GH#15183 + if other.dtype.kind == "M": + if self.tz is None: + return PyObject_RichCompare(self.asm8, other, op) + raise TypeError( + "Cannot compare tz-naive and tz-aware timestamps" + ) + if other.dtype.kind == "O": + # Operate element-wise + return np.array( + [PyObject_RichCompare(self, x, op) for x in other], + dtype=bool, + ) return PyObject_RichCompare(np.array([self]), other, op) return PyObject_RichCompare(other, self, reverse_ops[op]) else: @@ -253,6 +265,13 @@ cdef class _Timestamp(datetime): elif is_array(other): if other.dtype.kind in ['i', 'u']: raise integer_op_not_supported(self) + if other.dtype.kind == "m": + if self.tz is None: + return self.asm8 + other + return np.asarray( + [self + other[n] for n in range(len(other))], + dtype=object, + ) # index/series like elif hasattr(other, '_typ'): @@ -275,6 +294,13 @@ cdef class _Timestamp(datetime): elif is_array(other): if other.dtype.kind in ['i', 'u']: raise integer_op_not_supported(self) + if other.dtype.kind == "m": + if self.tz is None: + return self.asm8 - other + return np.asarray( + [self - other[n] for n in range(len(other))], + dtype=object, + ) typ = getattr(other, '_typ', None) if typ is not None: @@ -286,6 +312,10 @@ cdef class _Timestamp(datetime): # coerce if necessary if we are a 
Timestamp-like if (PyDateTime_Check(self) and (PyDateTime_Check(other) or is_datetime64_object(other))): + # both_timestamps is to determine whether Timedelta(self - other) + # should raise the OOB error, or fall back returning a timedelta. + both_timestamps = (isinstance(other, _Timestamp) and + isinstance(self, _Timestamp)) if isinstance(self, _Timestamp): other = type(self)(other) else: @@ -301,7 +331,14 @@ cdef class _Timestamp(datetime): from pandas._libs.tslibs.timedeltas import Timedelta try: return Timedelta(self.value - other.value) - except (OverflowError, OutOfBoundsDatetime): + except (OverflowError, OutOfBoundsDatetime) as err: + if isinstance(other, _Timestamp): + if both_timestamps: + raise OutOfBoundsDatetime( + "Result is too large for pandas.Timedelta. Convert inputs " + "to datetime.datetime with 'Timestamp.to_pydatetime()' " + "before subtracting." + ) from err pass elif is_datetime64_object(self): # GH#28286 cython semantics for __rsub__, `other` is actually diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index 08f539a70a7ed..68ad1d1e68133 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -1,12 +1,12 @@ -# -*- coding: utf-8 -*- - from cython cimport Py_ssize_t from numpy cimport int64_t, int32_t +ctypedef (int32_t, int32_t, int32_t) iso_calendar_t cdef int dayofweek(int y, int m, int d) nogil cdef bint is_leapyear(int64_t year) nogil cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil cpdef int32_t get_week_of_year(int year, int month, int day) nogil +cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil cpdef int32_t get_day_of_year(int year, int month, int day) nogil diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 0588dfe20e2e2..0873084d29555 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -150,33 +150,65 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: ------- week_of_year : int32_t + Notes + ----- + Assumes the inputs describe a valid date. + """ + return get_iso_calendar(year, month, day)[1] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil: + """ + Return the year, week, and day of year corresponding to ISO 8601 + + Parameters + ---------- + year : int + month : int + day : int + + Returns + ------- + year : int32_t + week : int32_t + day : int32_t + Notes ----- Assumes the inputs describe a valid date. 
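The ISO 8601 week date returned here can disagree with the Gregorian calendar year at year boundaries; the standard library computes the same (year, week, weekday) triple, which makes for an easy cross-check (a sketch using only `datetime.date`):

```python
from datetime import date

date(2016, 1, 1).isocalendar()    # ISO year 2015, week 53, weekday 5 (Friday)
date(2020, 1, 1).isocalendar()    # ISO year 2020, week 1,  weekday 3 (Wednesday)
date(2018, 12, 31).isocalendar()  # ISO year 2019, week 1,  weekday 1 (Monday)
```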
""" cdef: int32_t doy, dow - int woy + int32_t iso_year, iso_week doy = get_day_of_year(year, month, day) dow = dayofweek(year, month, day) # estimate - woy = (doy - 1) - dow + 3 - if woy >= 0: - woy = woy // 7 + 1 + iso_week = (doy - 1) - dow + 3 + if iso_week >= 0: + iso_week = iso_week // 7 + 1 # verify - if woy < 0: - if (woy > -2) or (woy == -2 and is_leapyear(year - 1)): - woy = 53 + if iso_week < 0: + if (iso_week > -2) or (iso_week == -2 and is_leapyear(year - 1)): + iso_week = 53 else: - woy = 52 - elif woy == 53: + iso_week = 52 + elif iso_week == 53: if 31 - day + dow < 3: - woy = 1 + iso_week = 1 + + iso_year = year + if iso_week == 1 and doy > 7: + iso_year += 1 + + elif iso_week >= 52 and doy < 7: + iso_year -= 1 - return woy + return iso_year, iso_week, dow + 1 @cython.wraparound(False) diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index bb20296e24587..e5b2a37860068 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from cpython.datetime cimport datetime from numpy cimport int64_t, int32_t diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 57483783faf9f..5a8d0a0ec1670 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -8,8 +8,7 @@ cnp.import_array() import pytz # stdlib datetime imports -from datetime import time as datetime_time -from cpython.datetime cimport (datetime, tzinfo, +from cpython.datetime cimport (datetime, time, tzinfo, PyDateTime_Check, PyDate_Check, PyDateTime_IMPORT) PyDateTime_IMPORT @@ -45,8 +44,8 @@ from pandas._libs.tslibs.tzconversion cimport ( # ---------------------------------------------------------------------- # Constants -NS_DTYPE = np.dtype('M8[ns]') -TD_DTYPE = np.dtype('m8[ns]') +DT64NS_DTYPE = np.dtype('M8[ns]') +TD64NS_DTYPE = np.dtype('m8[ns]') # ---------------------------------------------------------------------- @@ -106,11 +105,11 @@ def ensure_datetime64ns(arr: ndarray, copy: bool=True): ivalues = arr.view(np.int64).ravel() - result = np.empty(shape, dtype=NS_DTYPE) + result = np.empty(shape, dtype=DT64NS_DTYPE) iresult = result.ravel().view(np.int64) if len(iresult) == 0: - result = arr.view(NS_DTYPE) + result = arr.view(DT64NS_DTYPE) if copy: result = result.copy() return result @@ -146,7 +145,7 @@ def ensure_timedelta64ns(arr: ndarray, copy: bool=True): result : ndarray with dtype timedelta64[ns] """ - return arr.astype(TD_DTYPE, copy=copy) + return arr.astype(TD64NS_DTYPE, copy=copy) # TODO: check for overflows when going from a lower-resolution to nanos @@ -284,7 +283,7 @@ cdef convert_to_tsobject(object ts, object tz, object unit, return convert_datetime_to_tsobject(ts, tz, nanos) elif PyDate_Check(ts): # Keep the converter same as PyDateTime's - ts = datetime.combine(ts, datetime_time()) + ts = datetime.combine(ts, time()) return convert_datetime_to_tsobject(ts, tz) elif getattr(ts, '_typ', None) == 'period': raise ValueError("Cannot convert Period to Timestamp " @@ -595,8 +594,12 @@ cdef inline void localize_tso(_TSObject obj, tzinfo tz): obj.tzinfo = tz -cdef inline bint _infer_tsobject_fold(_TSObject obj, ndarray[int64_t] trans, - int64_t[:] deltas, int32_t pos): +cdef inline bint _infer_tsobject_fold( + _TSObject obj, + const int64_t[:] trans, + const int64_t[:] deltas, + int32_t pos, +): """ Infer _TSObject fold property from value by assuming 0 and then setting to 1 if necessary. 
@@ -738,7 +741,7 @@ def normalize_i8_timestamps(int64_t[:] stamps, object tz): @cython.wraparound(False) @cython.boundscheck(False) -cdef int64_t[:] _normalize_local(int64_t[:] stamps, tzinfo tz): +cdef int64_t[:] _normalize_local(const int64_t[:] stamps, tzinfo tz): """ Normalize each of the (nanosecond) timestamps in the given array by rounding down to the beginning of the day (i.e. midnight) for the @@ -818,7 +821,7 @@ cdef inline int64_t _normalized_stamp(npy_datetimestruct *dts) nogil: @cython.wraparound(False) @cython.boundscheck(False) -def is_date_array_normalized(int64_t[:] stamps, object tz=None): +def is_date_array_normalized(const int64_t[:] stamps, object tz=None): """ Check if all of the given (nanosecond) timestamps are normalized to midnight, i.e. hour == minute == second == 0. If the optional timezone diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 8bee7da6231ba..184d368659714 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -8,14 +8,14 @@ from cython import Py_ssize_t import numpy as np cimport numpy as cnp -from numpy cimport ndarray, int64_t, int32_t, int8_t +from numpy cimport ndarray, int64_t, int32_t, int8_t, uint32_t cnp.import_array() from pandas._libs.tslibs.ccalendar import ( get_locale_names, MONTHS_FULL, DAYS_FULL, DAY_SECONDS) from pandas._libs.tslibs.ccalendar cimport ( get_days_in_month, is_leapyear, dayofweek, get_week_of_year, - get_day_of_year) + get_day_of_year, get_iso_calendar, iso_calendar_t) from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, td64_to_tdstruct) @@ -38,7 +38,7 @@ def get_time_micros(const int64_t[:] dtindex): cdef: ndarray[int64_t] micros - micros = np.mod(dtindex, DAY_SECONDS * 1000000000, dtype=np.int64) + micros = np.mod(dtindex, DAY_SECONDS * 1_000_000_000, dtype=np.int64) micros //= 1000 return micros @@ -54,13 +54,15 @@ def build_field_sarray(const int64_t[:] dtindex): npy_datetimestruct dts ndarray[int32_t] years, months, days, hours, minutes, seconds, mus - sa_dtype = [('Y', 'i4'), # year - ('M', 'i4'), # month - ('D', 'i4'), # day - ('h', 'i4'), # hour - ('m', 'i4'), # min - ('s', 'i4'), # second - ('u', 'i4')] # microsecond + sa_dtype = [ + ("Y", "i4"), # year + ("M", "i4"), # month + ("D", "i4"), # day + ("h", "i4"), # hour + ("m", "i4"), # min + ("s", "i4"), # second + ("u", "i4"), # microsecond + ] out = np.empty(count, dtype=sa_dtype) @@ -157,9 +159,12 @@ def get_start_end_field(const int64_t[:] dtindex, object field, int mo_off, dom, doy, dow, ldom _month_offset = np.array( - [[0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365], - [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]], - dtype=np.int32) + [ + [0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365], + [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366], + ], + dtype=np.int32, + ) out = np.zeros(count, dtype='int8') @@ -665,3 +670,42 @@ cpdef isleapyear_arr(ndarray years): np.logical_and(years % 4 == 0, years % 100 > 0))] = 1 return out.view(bool) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def build_isocalendar_sarray(const int64_t[:] dtindex): + """ + Given a int64-based datetime array, return the ISO 8601 year, week, and day + as a structured array. 
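In user-facing terms, this structured array is the kind of data an `isocalendar()` accessor can be built on; assuming the `Series.dt.isocalendar()` accessor added around this release (an inference on my part, not stated in this hunk), the result looks like:

```python
import pandas as pd

ser = pd.Series(pd.to_datetime(["2019-12-29", "2019-12-30", "2020-01-01"]))
ser.dt.isocalendar()
#    year  week  day
# 0  2019    52    7
# 1  2020     1    1
# 2  2020     1    3
```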
+ """ + cdef: + Py_ssize_t i, count = len(dtindex) + npy_datetimestruct dts + ndarray[uint32_t] iso_years, iso_weeks, days + iso_calendar_t ret_val + + sa_dtype = [ + ("year", "u4"), + ("week", "u4"), + ("day", "u4"), + ] + + out = np.empty(count, dtype=sa_dtype) + + iso_years = out["year"] + iso_weeks = out["week"] + days = out["day"] + + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + ret_val = 0, 0, 0 + else: + dt64_to_dtstruct(dtindex[i], &dts) + ret_val = get_iso_calendar(dts.year, dts.month, dts.day) + + iso_years[i] = ret_val[0] + iso_weeks[i] = ret_val[1] + days[i] = ret_val[2] + return out diff --git a/pandas/_libs/tslibs/frequencies.pxd b/pandas/_libs/tslibs/frequencies.pxd index 6ec67ce250505..1b7efb8c5dfdf 100644 --- a/pandas/_libs/tslibs/frequencies.pxd +++ b/pandas/_libs/tslibs/frequencies.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - cpdef str get_rule_month(object source, str default=*) cpdef get_freq_code(freqstr) diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd index dae5bdc3f93b1..bd97462381b58 100644 --- a/pandas/_libs/tslibs/nattype.pxd +++ b/pandas/_libs/tslibs/nattype.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from cpython.datetime cimport datetime from numpy cimport int64_t diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 68a25d0cc481a..ec397a470f2ec 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1,10 +1,20 @@ from cpython.object cimport ( + Py_EQ, + Py_GE, + Py_GT, + Py_LE, + Py_LT, + Py_NE, PyObject_RichCompare, - Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE) +) -from cpython.datetime cimport (datetime, timedelta, - PyDateTime_Check, PyDelta_Check, - PyDateTime_IMPORT) +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, +) from cpython.version cimport PY_MINOR_VERSION @@ -16,20 +26,19 @@ from numpy cimport int64_t cnp.import_array() from pandas._libs.tslibs.np_datetime cimport ( - get_datetime64_value, get_timedelta64_value) + get_datetime64_value, + get_timedelta64_value, +) cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.util cimport ( - get_nat, is_integer_object, is_float_object, is_datetime64_object, - is_timedelta64_object) from pandas._libs.missing cimport C_NA # ---------------------------------------------------------------------- # Constants -nat_strings = {'NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN'} +nat_strings = {"NaT", "nat", "NAT", "nan", "NaN", "NAN"} -cdef int64_t NPY_NAT = get_nat() +cdef int64_t NPY_NAT = util.get_nat() iNaT = NPY_NAT # python-visible constant cdef bint _nat_scalar_rules[6] @@ -61,7 +70,7 @@ def _make_nat_func(func_name, doc): def _make_error_func(func_name, cls): def f(*args, **kwargs): - raise ValueError("NaTType does not support " + func_name) + raise ValueError(f"NaTType does not support {func_name}") f.__name__ = func_name if isinstance(cls, str): @@ -73,9 +82,9 @@ def _make_error_func(func_name, cls): cdef _nat_divide_op(self, other): - if PyDelta_Check(other) or is_timedelta64_object(other) or other is c_NaT: + if PyDelta_Check(other) or util.is_timedelta64_object(other) or other is c_NaT: return np.nan - if is_integer_object(other) or is_float_object(other): + if util.is_integer_object(other) or util.is_float_object(other): return c_NaT return NotImplemented @@ -103,7 +112,7 @@ cdef class _NaT(datetime): def __richcmp__(_NaT self, object other, int op): cdef: - int ndim = getattr(other, 'ndim', -1) + int ndim = 
getattr(other, "ndim", -1) if ndim == -1: return _nat_scalar_rules[op] @@ -114,11 +123,13 @@ cdef class _NaT(datetime): return result elif ndim == 0: - if is_datetime64_object(other): + if util.is_datetime64_object(other): return _nat_scalar_rules[op] else: - raise TypeError(f'Cannot compare type {type(self).__name__} ' - f'with type {type(other).__name__}') + raise TypeError( + f"Cannot compare type {type(self).__name__} " + f"with type {type(other).__name__}" + ) # Note: instead of passing "other, self, _reverse_ops[op]", we observe # that `_nat_scalar_rules` is invariant under `_reverse_ops`, @@ -134,19 +145,19 @@ cdef class _NaT(datetime): return c_NaT elif PyDelta_Check(other): return c_NaT - elif is_datetime64_object(other) or is_timedelta64_object(other): + elif util.is_datetime64_object(other) or util.is_timedelta64_object(other): return c_NaT - elif hasattr(other, 'delta'): + elif hasattr(other, "delta"): # Timedelta, offsets.Tick, offsets.Week return c_NaT - elif is_integer_object(other) or util.is_period_object(other): + elif util.is_integer_object(other) or util.is_period_object(other): # For Period compat # TODO: the integer behavior is deprecated, remove it return c_NaT elif util.is_array(other): - if other.dtype.kind in 'mM': + if other.dtype.kind in "mM": # If we are adding to datetime64, we treat NaT as timedelta # Either way, result dtype is datetime64 result = np.empty(other.shape, dtype="datetime64[ns]") @@ -171,19 +182,19 @@ cdef class _NaT(datetime): return c_NaT elif PyDelta_Check(other): return c_NaT - elif is_datetime64_object(other) or is_timedelta64_object(other): + elif util.is_datetime64_object(other) or util.is_timedelta64_object(other): return c_NaT - elif hasattr(other, 'delta'): + elif hasattr(other, "delta"): # offsets.Tick, offsets.Week return c_NaT - elif is_integer_object(other) or util.is_period_object(other): + elif util.is_integer_object(other) or util.is_period_object(other): # For Period compat # TODO: the integer behavior is deprecated, remove it return c_NaT elif util.is_array(other): - if other.dtype.kind == 'm': + if other.dtype.kind == "m": if not is_rsub: # NaT - timedelta64 we treat NaT as datetime64, so result # is datetime64 @@ -197,15 +208,16 @@ cdef class _NaT(datetime): result.fill("NaT") return result - elif other.dtype.kind == 'M': + elif other.dtype.kind == "M": # We treat NaT as a datetime, so regardless of whether this is # NaT - other or other - NaT, the result is timedelta64 result = np.empty(other.shape, dtype="timedelta64[ns]") result.fill("NaT") return result - raise TypeError(f"Cannot subtract NaT from ndarray with " - f"dtype {other.dtype}") + raise TypeError( + f"Cannot subtract NaT from ndarray with dtype {other.dtype}" + ) return NotImplemented @@ -225,19 +237,19 @@ cdef class _NaT(datetime): return _nat_divide_op(self, other) def __mul__(self, other): - if is_integer_object(other) or is_float_object(other): + if util.is_integer_object(other) or util.is_float_object(other): return NaT return NotImplemented @property def asm8(self) -> np.datetime64: - return np.datetime64(NPY_NAT, 'ns') + return np.datetime64(NPY_NAT, "ns") def to_datetime64(self) -> np.datetime64: """ Return a numpy.datetime64 object with 'ns' precision. 
""" - return np.datetime64('NaT', 'ns') + return np.datetime64('NaT', "ns") def to_numpy(self, dtype=None, copy=False) -> np.datetime64: """ @@ -260,14 +272,14 @@ cdef class _NaT(datetime): return self.to_datetime64() def __repr__(self) -> str: - return 'NaT' + return "NaT" def __str__(self) -> str: - return 'NaT' + return "NaT" - def isoformat(self, sep='T') -> str: + def isoformat(self, sep="T") -> str: # This allows Timestamp(ts.isoformat()) to always correctly roundtrip. - return 'NaT' + return "NaT" def __hash__(self): return NPY_NAT @@ -308,7 +320,9 @@ cdef class _NaT(datetime): class NaTType(_NaT): - """(N)ot-(A)-(T)ime, the time equivalent of NaN""" + """ + (N)ot-(A)-(T)ime, the time equivalent of NaN. + """ def __new__(cls): cdef _NaT base @@ -338,7 +352,7 @@ class NaTType(_NaT): return _nat_rdivide_op(self, other) def __rmul__(self, other): - if is_integer_object(other) or is_float_object(other): + if util.is_integer_object(other) or util.is_float_object(other): return c_NaT return NotImplemented @@ -379,10 +393,11 @@ class NaTType(_NaT): # These are the ones that can get their docstrings from datetime. # nan methods - weekday = _make_nan_func('weekday', datetime.weekday.__doc__) - isoweekday = _make_nan_func('isoweekday', datetime.isoweekday.__doc__) - total_seconds = _make_nan_func('total_seconds', timedelta.total_seconds.__doc__) - month_name = _make_nan_func('month_name', # noqa:E128 + weekday = _make_nan_func("weekday", datetime.weekday.__doc__) + isoweekday = _make_nan_func("isoweekday", datetime.isoweekday.__doc__) + total_seconds = _make_nan_func("total_seconds", timedelta.total_seconds.__doc__) + month_name = _make_nan_func( + "month_name", """ Return the month name of the Timestamp with specified locale. @@ -396,8 +411,10 @@ class NaTType(_NaT): month_name : string .. versionadded:: 0.23.0 - """) - day_name = _make_nan_func('day_name', # noqa:E128 + """, + ) + day_name = _make_nan_func( + "day_name", """ Return the day name of the Timestamp with specified locale. @@ -411,73 +428,79 @@ class NaTType(_NaT): day_name : string .. 
versionadded:: 0.23.0 - """) + """, + ) # _nat_methods - date = _make_nat_func('date', datetime.date.__doc__) - - utctimetuple = _make_error_func('utctimetuple', datetime) - timetz = _make_error_func('timetz', datetime) - timetuple = _make_error_func('timetuple', datetime) - strftime = _make_error_func('strftime', datetime) - isocalendar = _make_error_func('isocalendar', datetime) - dst = _make_error_func('dst', datetime) - ctime = _make_error_func('ctime', datetime) - time = _make_error_func('time', datetime) - toordinal = _make_error_func('toordinal', datetime) - tzname = _make_error_func('tzname', datetime) - utcoffset = _make_error_func('utcoffset', datetime) + date = _make_nat_func("date", datetime.date.__doc__) + + utctimetuple = _make_error_func("utctimetuple", datetime) + timetz = _make_error_func("timetz", datetime) + timetuple = _make_error_func("timetuple", datetime) + strftime = _make_error_func("strftime", datetime) + isocalendar = _make_error_func("isocalendar", datetime) + dst = _make_error_func("dst", datetime) + ctime = _make_error_func("ctime", datetime) + time = _make_error_func("time", datetime) + toordinal = _make_error_func("toordinal", datetime) + tzname = _make_error_func("tzname", datetime) + utcoffset = _make_error_func("utcoffset", datetime) # "fromisocalendar" was introduced in 3.8 if PY_MINOR_VERSION >= 8: - fromisocalendar = _make_error_func('fromisocalendar', datetime) + fromisocalendar = _make_error_func("fromisocalendar", datetime) # ---------------------------------------------------------------------- # The remaining methods have docstrings copy/pasted from the analogous # Timestamp methods. - strptime = _make_error_func('strptime', # noqa:E128 + strptime = _make_error_func( + "strptime", """ Timestamp.strptime(string, format) Function is not implemented. Use pd.to_datetime(). - """ + """, ) - utcfromtimestamp = _make_error_func('utcfromtimestamp', # noqa:E128 + utcfromtimestamp = _make_error_func( + "utcfromtimestamp", """ Timestamp.utcfromtimestamp(ts) Construct a naive UTC datetime from a POSIX timestamp. - """ + """, ) - fromtimestamp = _make_error_func('fromtimestamp', # noqa:E128 + fromtimestamp = _make_error_func( + "fromtimestamp", """ Timestamp.fromtimestamp(ts) timestamp[, tz] -> tz's local time from POSIX timestamp. - """ + """, ) - combine = _make_error_func('combine', # noqa:E128 + combine = _make_error_func( + "combine", """ Timestamp.combine(date, time) date, time -> datetime with same date and time fields. - """ + """, ) - utcnow = _make_error_func('utcnow', # noqa:E128 + utcnow = _make_error_func( + "utcnow", """ Timestamp.utcnow() Return a new Timestamp representing UTC day and time. - """ + """, ) - timestamp = _make_error_func('timestamp', # noqa:E128 - """Return POSIX timestamp as float.""") + timestamp = _make_error_func("timestamp", "Return POSIX timestamp as float.") # GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or # return NaT create functions that raise, for binding to NaTType - astimezone = _make_error_func('astimezone', # noqa:E128 + astimezone = _make_error_func( + "astimezone", """ Convert tz-aware Timestamp to another time zone. @@ -495,8 +518,10 @@ class NaTType(_NaT): ------ TypeError If Timestamp is tz-naive. - """) - fromordinal = _make_error_func('fromordinal', # noqa:E128 + """, + ) + fromordinal = _make_error_func( + "fromordinal", """ Timestamp.fromordinal(ordinal, freq=None, tz=None) @@ -511,17 +536,21 @@ class NaTType(_NaT): Offset to apply to the Timestamp. 
tz : str, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. - """) + """, + ) # _nat_methods - to_pydatetime = _make_nat_func('to_pydatetime', # noqa:E128 + to_pydatetime = _make_nat_func( + "to_pydatetime", """ Convert a Timestamp object to a native Python datetime object. If warn=True, issue a warning if nanoseconds is nonzero. - """) + """, + ) - now = _make_nat_func('now', # noqa:E128 + now = _make_nat_func( + "now", """ Timestamp.now(tz=None) @@ -532,8 +561,10 @@ class NaTType(_NaT): ---------- tz : str or timezone object, default None Timezone to localize to. - """) - today = _make_nat_func('today', # noqa:E128 + """, + ) + today = _make_nat_func( + "today", """ Timestamp.today(cls, tz=None) @@ -545,8 +576,10 @@ class NaTType(_NaT): ---------- tz : str or timezone object, default None Timezone to localize to. - """) - round = _make_nat_func('round', # noqa:E128 + """, + ) + round = _make_nat_func( + "round", """ Round the Timestamp to the specified resolution. @@ -586,8 +619,10 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted - """) - floor = _make_nat_func('floor', # noqa:E128 + """, + ) + floor = _make_nat_func( + "floor", """ return a new Timestamp floored to this resolution. @@ -623,8 +658,10 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted. - """) - ceil = _make_nat_func('ceil', # noqa:E128 + """, + ) + ceil = _make_nat_func( + "ceil", """ return a new Timestamp ceiled to this resolution. @@ -660,9 +697,11 @@ timedelta}, default 'raise' Raises ------ ValueError if the freq cannot be converted. - """) + """, + ) - tz_convert = _make_nat_func('tz_convert', # noqa:E128 + tz_convert = _make_nat_func( + "tz_convert", """ Convert tz-aware Timestamp to another time zone. @@ -680,8 +719,10 @@ timedelta}, default 'raise' ------ TypeError If Timestamp is tz-naive. - """) - tz_localize = _make_nat_func('tz_localize', # noqa:E128 + """, + ) + tz_localize = _make_nat_func( + "tz_localize", """ Convert naive Timestamp to local time zone, or remove timezone from tz-aware Timestamp. @@ -733,8 +774,10 @@ default 'raise' ------ TypeError If the Timestamp is tz-aware and tz is not None. - """) - replace = _make_nat_func('replace', # noqa:E128 + """, + ) + replace = _make_nat_func( + "replace", """ implements datetime.replace, handles nanoseconds. @@ -754,7 +797,8 @@ default 'raise' Returns ------- Timestamp with fields replaced - """) + """, + ) c_NaT = NaTType() # C-visible @@ -764,13 +808,15 @@ NaT = c_NaT # Python-visible # ---------------------------------------------------------------------- cdef inline bint checknull_with_nat(object val): - """ utility to check if a value is a nat or not """ + """ + Utility to check if a value is a nat or not. + """ return val is None or util.is_nan(val) or val is c_NaT or val is C_NA cpdef bint is_null_datetimelike(object val, bint inat_is_null=True): """ - Determine if we have a null for a timedelta/datetime (or integer versions) + Determine if we have a null for a timedelta/datetime (or integer versions). 
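Based on the checks visible in this function (and the `__all__` export added earlier in this patch), the helper can be exercised directly; treat the exact truth table below as my reading of the implementation rather than documented API:

```python
import numpy as np
from pandas._libs.tslibs import NaT, iNaT, is_null_datetimelike

is_null_datetimelike(None)                  # True
is_null_datetimelike(np.nan)                # True
is_null_datetimelike(NaT)                   # True
is_null_datetimelike(np.datetime64("NaT"))  # True
is_null_datetimelike(iNaT)                  # True, since inat_is_null defaults to True
is_null_datetimelike(0)                     # False
```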
Parameters ---------- @@ -780,7 +826,7 @@ cpdef bint is_null_datetimelike(object val, bint inat_is_null=True): Returns ------- - null_datetimelike : bool + bool """ if val is None: return True diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index ebedee79405e5..c936d42b34db5 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from cpython.datetime cimport date, datetime from numpy cimport int64_t, int32_t diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index b59a1101e0bf7..9a8a8fdae6d2f 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,12 +1,15 @@ from cpython.object cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE -from cpython.datetime cimport (datetime, date, - PyDateTime_IMPORT, - PyDateTime_GET_YEAR, PyDateTime_GET_MONTH, - PyDateTime_GET_DAY, PyDateTime_DATE_GET_HOUR, - PyDateTime_DATE_GET_MINUTE, - PyDateTime_DATE_GET_SECOND, - PyDateTime_DATE_GET_MICROSECOND) +from cpython.datetime cimport ( + PyDateTime_DATE_GET_HOUR, + PyDateTime_DATE_GET_MICROSECOND, + PyDateTime_DATE_GET_MINUTE, + PyDateTime_DATE_GET_SECOND, + PyDateTime_GET_DAY, + PyDateTime_GET_MONTH, + PyDateTime_GET_YEAR, + PyDateTime_IMPORT, +) PyDateTime_IMPORT from numpy cimport int64_t diff --git a/pandas/_libs/tslibs/offsets.pxd b/pandas/_libs/tslibs/offsets.pxd index 2829a27b9905c..5a553be537e52 100644 --- a/pandas/_libs/tslibs/offsets.pxd +++ b/pandas/_libs/tslibs/offsets.pxd @@ -1,3 +1 @@ -# -*- coding: utf-8 -*- - cdef to_offset(object obj) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 48a3886c20a3a..306636278bcbe 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -114,7 +114,18 @@ def apply_index_wraps(func): # Note: normally we would use `@functools.wraps(func)`, but this does # not play nicely with cython class methods def wrapper(self, other): - result = func(self, other) + + is_index = getattr(other, "_typ", "") == "datetimeindex" + + # operate on DatetimeArray + arr = other._data if is_index else other + + result = func(self, arr) + + if is_index: + # Wrap DatetimeArray result back to DatetimeIndex + result = type(other)._simple_new(result, name=other.name) + if self.normalize: result = result.to_period('D').to_timestamp() return result @@ -451,7 +462,7 @@ class _BaseOffset: def _validate_n(self, n): """ - Require that `n` be a nonzero integer. + Require that `n` be an integer. Parameters ---------- @@ -509,7 +520,7 @@ class _BaseOffset: state = self.__dict__.copy() # we don't want to actually pickle the calendar object - # as its a np.busyday; we recreate on deserilization + # as its a np.busyday; we recreate on deserialization if 'calendar' in state: del state['calendar'] try: @@ -598,8 +609,13 @@ cdef inline int month_add_months(npy_datetimestruct dts, int months) nogil: @cython.wraparound(False) @cython.boundscheck(False) -def shift_quarters(int64_t[:] dtindex, int quarters, - int q1start_month, object day, int modby=3): +def shift_quarters( + const int64_t[:] dtindex, + int quarters, + int q1start_month, + object day, + int modby=3, +): """ Given an int64 array representing nanosecond timestamps, shift all elements by the specified number of quarters using DateOffset semantics. 
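These kernels are the vectorised backbone for month- and quarter-anchored DateOffsets applied to a DatetimeIndex (that mapping is an inference from the function names, not spelled out here); the user-visible effect:

```python
import pandas as pd

idx = pd.date_range("2020-01-15", periods=3, freq="D")

idx + pd.offsets.MonthEnd()
# -> 2020-01-31, 2020-01-31, 2020-01-31

idx + pd.offsets.QuarterEnd(startingMonth=3)
# -> 2020-03-31, 2020-03-31, 2020-03-31
```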
@@ -748,7 +764,7 @@ def shift_quarters(int64_t[:] dtindex, int quarters, @cython.wraparound(False) @cython.boundscheck(False) -def shift_months(int64_t[:] dtindex, int months, object day=None): +def shift_months(const int64_t[:] dtindex, int months, object day=None): """ Given an int64-based datetime index, shift all elements specified number of months using DateOffset semantics diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index ebdf7a1e29216..1b980aea372e2 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -189,8 +189,13 @@ cdef inline bint does_string_look_like_time(str parse_string): return 0 <= hour <= 23 and 0 <= minute <= 59 -def parse_datetime_string(date_string: str, freq=None, dayfirst=False, - yearfirst=False, **kwargs): +def parse_datetime_string( + str date_string, + object freq=None, + bint dayfirst=False, + bint yearfirst=False, + **kwargs, +): """ Parse datetime string, only returns datetime. Also cares special handling matching time patterns. @@ -272,8 +277,9 @@ def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None): return res -cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False, - yearfirst=False): +cdef parse_datetime_string_with_reso( + str date_string, object freq=None, bint dayfirst=False, bint yearfirst=False, +): """ Parse datetime string and try to identify its resolution. @@ -349,7 +355,7 @@ cpdef bint _does_string_look_like_datetime(str py_string): elif py_string in _not_datelike_strings: return False else: - # xstrtod with such paramaters copies behavior of python `float` + # xstrtod with such parameters copies behavior of python `float` # cast; for example, " 35.e-1 " is valid string for this cast so, # for correctly xstrtod call necessary to pass these params: # b'.' 
- a dot is used as separator, b'e' - an exponential form of @@ -467,8 +473,14 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, raise ValueError(f'Unable to parse {date_string}') -cdef dateutil_parse(str timestr, object default, ignoretz=False, - tzinfos=None, dayfirst=None, yearfirst=None): +cdef dateutil_parse( + str timestr, + object default, + bint ignoretz=False, + object tzinfos=None, + bint dayfirst=False, + bint yearfirst=False, +): """ lifted from dateutil to get resolution""" cdef: @@ -531,8 +543,9 @@ cdef dateutil_parse(str timestr, object default, ignoretz=False, # Parsing for type-inference -def try_parse_dates(object[:] values, parser=None, - dayfirst=False, default=None): +def try_parse_dates( + object[:] values, parser=None, bint dayfirst=False, default=None, +): cdef: Py_ssize_t i, n object[:] result @@ -569,16 +582,21 @@ def try_parse_dates(object[:] values, parser=None, return result.base # .base to access underlying ndarray -def try_parse_date_and_time(object[:] dates, object[:] times, - date_parser=None, time_parser=None, - dayfirst=False, default=None): +def try_parse_date_and_time( + object[:] dates, + object[:] times, + date_parser=None, + time_parser=None, + bint dayfirst=False, + default=None, +): cdef: Py_ssize_t i, n object[:] result n = len(dates) - # Cast to avoid build warning see GH#26757 - if len(times) != n: + # TODO(cython 3.0): Use len instead of `shape[0]` + if times.shape[0] != n: raise ValueError('Length of dates and times must be equal') result = np.empty(n, dtype='O') @@ -607,15 +625,14 @@ def try_parse_date_and_time(object[:] dates, object[:] times, return result.base # .base to access underlying ndarray -def try_parse_year_month_day(object[:] years, object[:] months, - object[:] days): +def try_parse_year_month_day(object[:] years, object[:] months, object[:] days): cdef: Py_ssize_t i, n object[:] result n = len(years) - # Cast to avoid build warning see GH#26757 - if len(months) != n or len(days) != n: + # TODO(cython 3.0): Use len instead of `shape[0]` + if months.shape[0] != n or days.shape[0] != n: raise ValueError('Length of years/months/days must all be equal') result = np.empty(n, dtype='O') @@ -640,10 +657,14 @@ def try_parse_datetime_components(object[:] years, double micros n = len(years) - # Cast to avoid build warning see GH#26757 - if (len(months) != n or len(days) != n or - len(hours) != n or len(minutes) != n or - len(seconds) != n): + # TODO(cython 3.0): Use len instead of `shape[0]` + if ( + months.shape[0] != n + or days.shape[0] != n + or hours.shape[0] != n + or minutes.shape[0] != n + or seconds.shape[0] != n + ): raise ValueError('Length of all datetime components must be equal') result = np.empty(n, dtype='O') @@ -701,6 +722,9 @@ class _timelex: function maintains a "token stack", for when the ambiguous context demands that multiple tokens be parsed at once. """ + cdef: + Py_ssize_t n + stream = self.stream.replace('\x00', '') # TODO: Change \s --> \s+ (this doesn't match existing behavior) @@ -756,15 +780,20 @@ def _format_is_iso(f) -> bint: return False -def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, - dt_str_split=_DATEUTIL_LEXER_SPLIT): +def _guess_datetime_format( + dt_str, + bint dayfirst=False, + dt_str_parse=du_parse, + dt_str_split=_DATEUTIL_LEXER_SPLIT, +): """ Guess the datetime format of a given datetime string. 
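This guesser is what lets a column of consistently formatted strings be parsed with one inferred format instead of per-element parsing; a `%Z` entry further down extends the recognisable directives to timezone names. Assuming the public `infer_datetime_format` flag routes through it (my reading, not stated here):

```python
import pandas as pd

dates = ["2020-01-01 00:00:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"]

pd.to_datetime(dates, infer_datetime_format=True)
# DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq=None)
```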
Parameters ---------- - dt_str : string, datetime string to guess the format of - dayfirst : boolean, default False + dt_str : str + Datetime string to guess the format of. + dayfirst : bool, default False If True parses dates with the day first, eg 20/01/2005 Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug). @@ -801,6 +830,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, (('second',), '%S', 2), (('microsecond',), '%f', 6), (('second', 'microsecond'), '%S.%f', 0), + (('tzinfo',), '%Z', 0), ] if dayfirst: @@ -873,8 +903,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, @cython.wraparound(False) @cython.boundscheck(False) -cdef inline object convert_to_unicode(object item, - bint keep_trivial_numbers): +cdef inline object convert_to_unicode(object item, bint keep_trivial_numbers): """ Convert `item` to str. diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c3a47902cff0f..dd745f840d0ab 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1,5 +1,3 @@ -from datetime import datetime - from cpython.object cimport PyObject_RichCompareBool, Py_EQ, Py_NE from numpy cimport int64_t, import_array, ndarray @@ -13,6 +11,7 @@ from libc.string cimport strlen, memset import cython from cpython.datetime cimport ( + datetime, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index a8a47e2e90f93..f647098140528 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -21,7 +21,6 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #endif // NPY_NO_DEPRECATED_API #include -#include #include #include @@ -313,15 +312,14 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a, * object into a NumPy npy_datetimestruct. Uses tzinfo (if present) * to convert to UTC time. * - * While the C API has PyDate_* and PyDateTime_* functions, the following - * implementation just asks for attributes, and thus supports - * datetime duck typing. The tzinfo time zone conversion would require - * this style of access anyway. + * The following implementation just asks for attributes, and thus + * supports datetime duck typing. The tzinfo time zone conversion + * requires this style of access as well. * * Returns -1 on error, 0 on success, and 1 (with no error set) * if obj doesn't have the needed date or datetime attributes. */ -int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj, +int convert_pydatetime_to_datetimestruct(PyObject *dtobj, npy_datetimestruct *out) { // Assumes that obj is a valid datetime object PyObject *tmp; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.h b/pandas/_libs/tslibs/src/datetime/np_datetime.h index 549d38409ca83..0bbc24ed822c5 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.h @@ -22,7 +22,6 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #endif // NPY_NO_DEPRECATED_API #include -#include typedef struct { npy_int64 days; @@ -35,7 +34,7 @@ extern const npy_datetimestruct _NS_MAX_DTS; // stuff pandas needs // ---------------------------------------------------------------------------- -int convert_pydatetime_to_datetimestruct(PyDateTime_Date *dtobj, +int convert_pydatetime_to_datetimestruct(PyObject *dtobj, npy_datetimestruct *out); npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 54ed6ecff21e2..b245ae5880ecb 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -905,3 +905,37 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, outlen); return -1; } + + +int make_iso_8601_timedelta(pandas_timedeltastruct *tds, + char *outstr, size_t *outlen) { + *outlen = 0; + *outlen += snprintf(outstr, 60, // NOLINT + "P%" NPY_INT64_FMT + "DT%" NPY_INT32_FMT + "H%" NPY_INT32_FMT + "M%" NPY_INT32_FMT, + tds->days, tds->hrs, tds->min, tds->sec); + outstr += *outlen; + + if (tds->ns != 0) { + *outlen += snprintf(outstr, 12, // NOLINT + ".%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT + "S", tds->ms, tds->us, tds->ns); + } else if (tds->us != 0) { + *outlen += snprintf(outstr, 9, // NOLINT + ".%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT + "S", tds->ms, tds->us); + } else if (tds->ms != 0) { + *outlen += snprintf(outstr, 6, // NOLINT + ".%03" NPY_INT32_FMT "S", tds->ms); + } else { + *outlen += snprintf(outstr, 2, // NOLINT + "%s", "S"); + } + + return 0; +} diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 880c34ea77638..200a71ff0c2b7 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -79,4 +79,14 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, NPY_DATETIMEUNIT base); + +/* + * Converts an pandas_timedeltastruct to an ISO 8601 string. + * + * Mutates outlen to provide size of (non-NULL terminated) string. 
+ * + * Currently has no error handling + */ +int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, + size_t *outlen); #endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index dfe050c7bbff7..ce4d3a4ef8e02 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -4,7 +4,6 @@ import time import locale import calendar import re -from datetime import date as datetime_date from _thread import allocate_lock as _thread_allocate_lock @@ -13,6 +12,7 @@ import pytz import numpy as np from numpy cimport int64_t +cimport cpython.datetime as datetime from pandas._libs.tslibs.np_datetime cimport ( check_dts_bounds, dtstruct_to_dt64, npy_datetimestruct) @@ -288,20 +288,20 @@ def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise' elif iso_year != -1 and iso_week != -1: year, julian = _calc_julian_from_V(iso_year, iso_week, weekday + 1) - # Cannot pre-calculate datetime_date() since can change in Julian + # Cannot pre-calculate datetime.date() since can change in Julian # calculation and thus could have different value for the day of the wk # calculation. try: if julian == -1: # Need to add 1 to result since first day of the year is 1, not # 0. - ordinal = datetime_date(year, month, day).toordinal() - julian = ordinal - datetime_date(year, 1, 1).toordinal() + 1 + ordinal = datetime.date(year, month, day).toordinal() + julian = ordinal - datetime.date(year, 1, 1).toordinal() + 1 else: # Assume that if they bothered to include Julian day it will # be accurate. - datetime_result = datetime_date.fromordinal( - (julian - 1) + datetime_date(year, 1, 1).toordinal()) + datetime_result = datetime.date.fromordinal( + (julian - 1) + datetime.date(year, 1, 1).toordinal()) year = datetime_result.year month = datetime_result.month day = datetime_result.day @@ -311,7 +311,7 @@ def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise' continue raise if weekday == -1: - weekday = datetime_date(year, month, day).weekday() + weekday = datetime.date(year, month, day).weekday() dts.year = year dts.month = month @@ -649,7 +649,7 @@ cdef int _calc_julian_from_U_or_W(int year, int week_of_year, cdef: int first_weekday, week_0_length, days_to_week - first_weekday = datetime_date(year, 1, 1).weekday() + first_weekday = datetime.date(year, 1, 1).weekday() # If we are dealing with the %U directive (week starts on Sunday), it's # easier to just shift the view to Sunday being the first day of the # week. 
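The week-number bookkeeping here mirrors the standard library's strptime: a (year, week, weekday) triple is resolved to an ordinal date, with `_calc_julian_from_V` handling the ISO variant. A stdlib sketch of the same directives (assuming Python 3.6+ for `%G`/`%V`/`%u`):

```python
from datetime import datetime

# %W counts weeks starting on Monday; %w is the weekday with Sunday == 0.
datetime.strptime("2020 10 1", "%Y %W %w")   # datetime(2020, 3, 9), the Monday of week 10

# ISO week date: %G is the ISO year, %V the ISO week, %u the ISO weekday.
datetime.strptime("2020 01 3", "%G %V %u")   # datetime(2020, 1, 1)
```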
@@ -692,14 +692,14 @@ cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday) cdef: int correction, ordinal - correction = datetime_date(iso_year, 1, 4).isoweekday() + 3 + correction = datetime.date(iso_year, 1, 4).isoweekday() + 3 ordinal = (iso_week * 7) + iso_weekday - correction # ordinal may be negative or 0 now, which means the date is in the previous # calendar year if ordinal < 1: - ordinal += datetime_date(iso_year, 1, 1).toordinal() + ordinal += datetime.date(iso_year, 1, 1).toordinal() iso_year -= 1 - ordinal -= datetime_date(iso_year, 1, 1).toordinal() + ordinal -= datetime.date(iso_year, 1, 1).toordinal() return iso_year, ordinal diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd index 097309b17823b..d7af7636df753 100644 --- a/pandas/_libs/tslibs/timedeltas.pxd +++ b/pandas/_libs/tslibs/timedeltas.pxd @@ -1,8 +1,6 @@ -# -*- coding: utf-8 -*- - from numpy cimport int64_t # Exposed for tslib, not intended for outside use. -cdef int64_t cast_from_unit(object ts, object unit) except? -1 +cdef int64_t cast_from_unit(object ts, str unit) except? -1 cpdef int64_t delta_to_nanoseconds(delta) except? -1 cdef convert_to_timedelta64(object ts, object unit) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 66660c5f641fd..c5092c8630f06 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -82,6 +82,7 @@ cdef dict timedelta_abbrevs = { "us": "us", "microseconds": "us", "microsecond": "us", + "µs": "us", "micro": "us", "micros": "us", "u": "us", @@ -101,7 +102,7 @@ _no_input = object() @cython.boundscheck(False) @cython.wraparound(False) -def ints_to_pytimedelta(int64_t[:] arr, box=False): +def ints_to_pytimedelta(const int64_t[:] arr, box=False): """ convert an i8 repr to an ndarray of timedelta or Timedelta (if box == True) @@ -256,10 +257,15 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): return iresult.base # .base to access underlying np.ndarray -cpdef inline object precision_from_unit(object unit): +cpdef inline object precision_from_unit(str unit): """ Return a casting of the unit represented to nanoseconds + the precision to round the fractional part. + + Notes + ----- + The caller is responsible for ensuring that the default value of "ns" + takes the place of None. """ cdef: int64_t m @@ -300,7 +306,7 @@ cpdef inline object precision_from_unit(object unit): return m, p -cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: +cdef inline int64_t cast_from_unit(object ts, str unit) except? -1: """ return a casting of the unit represented to nanoseconds round the fractional part of a float to our precision, p """ cdef: @@ -524,15 +530,24 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): return cast_from_unit(float(n), unit) -cpdef inline object parse_timedelta_unit(object unit): +cpdef inline str parse_timedelta_unit(object unit): """ Parameters ---------- - unit : an unit string + unit : str or None + + Returns + ------- + str + Canonical unit string. + + Raises + ------ + ValueError : on non-parseable input """ if unit is None: - return 'ns' - elif unit == 'M': + return "ns" + elif unit == "M": return unit try: return timedelta_abbrevs[unit.lower()] @@ -621,14 +636,14 @@ def _binary_op_method_timedeltalike(op, name): # ---------------------------------------------------------------------- # Timedelta Construction -cdef inline int64_t parse_iso_format_string(object ts) except? 
-1: +cdef inline int64_t parse_iso_format_string(str ts) except? -1: """ Extracts and cleanses the appropriate values from a match object with groups for each component of an ISO 8601 duration Parameters ---------- - ts: + ts: str ISO 8601 Duration formatted string Returns @@ -763,36 +778,32 @@ cdef class _Timedelta(timedelta): if isinstance(other, _Timedelta): ots = other - elif PyDelta_Check(other) or isinstance(other, Tick): + elif (is_timedelta64_object(other) or PyDelta_Check(other) + or isinstance(other, Tick)): ots = Timedelta(other) - else: - ndim = getattr(other, "ndim", -1) + # TODO: watch out for overflows - if ndim != -1: - if ndim == 0: - if is_timedelta64_object(other): - other = Timedelta(other) - else: - if op == Py_EQ: - return False - elif op == Py_NE: - return True - # only allow ==, != ops - raise TypeError(f'Cannot compare type ' - f'{type(self).__name__} with ' - f'type {type(other).__name__}') - if util.is_array(other): - return PyObject_RichCompare(np.array([self]), other, op) - return PyObject_RichCompare(other, self, reverse_ops[op]) - else: - if other is NaT: - return PyObject_RichCompare(other, self, reverse_ops[op]) - elif op == Py_EQ: - return False - elif op == Py_NE: - return True - raise TypeError(f'Cannot compare type {type(self).__name__} with ' - f'type {type(other).__name__}') + elif other is NaT: + return op == Py_NE + + elif util.is_array(other): + # TODO: watch out for zero-dim + if other.dtype.kind == "m": + return PyObject_RichCompare(self.asm8, other, op) + elif other.dtype.kind == "O": + # operate element-wise + return np.array( + [PyObject_RichCompare(self, x, op) for x in other], + dtype=bool, + ) + if op == Py_EQ: + return np.zeros(other.shape, dtype=bool) + elif op == Py_NE: + return np.ones(other.shape, dtype=bool) + return NotImplemented # let other raise TypeError + + else: + return NotImplemented return cmp_scalar(self.value, ots.value, op) @@ -1071,9 +1082,11 @@ cdef class _Timedelta(timedelta): subs = (self._h or self._m or self._s or self._ms or self._us or self._ns) - # by default not showing nano if self._ms or self._us or self._ns: seconds_fmt = "{seconds:02}.{milliseconds:03}{microseconds:03}" + if self._ns: + # GH#9309 + seconds_fmt += "{nanoseconds:03}" else: seconds_fmt = "{seconds:02}" @@ -1167,7 +1180,7 @@ class Timedelta(_Timedelta): Possible values: - * 'Y', 'M', 'W', 'D', 'T', 'S', 'L', 'U', or 'N' + * 'W', 'D', 'T', 'S', 'L', 'U', or 'N' * 'days' or 'day' * 'hours', 'hour', 'hr', or 'h' * 'minutes', 'minute', 'min', or 'm' @@ -1198,7 +1211,7 @@ class Timedelta(_Timedelta): kwargs = {key: _to_py_int_float(kwargs[key]) for key in kwargs} - nano = np.timedelta64(kwargs.pop('nanoseconds', 0), 'ns') + nano = convert_to_timedelta64(kwargs.pop('nanoseconds', 0), 'ns') try: value = nano + convert_to_timedelta64(timedelta(**kwargs), 'ns') @@ -1407,7 +1420,14 @@ class Timedelta(_Timedelta): # convert to Timedelta below pass + elif util.is_nan(other): + # i.e. 
np.nan or np.float64("NaN") + raise TypeError("Cannot divide float by Timedelta") + elif hasattr(other, 'dtype'): + if other.dtype.kind == "O": + # GH#31869 + return np.array([x / self for x in other]) return other / self.to_timedelta64() elif not _validate_ops_compat(other): @@ -1415,7 +1435,8 @@ class Timedelta(_Timedelta): other = Timedelta(other) if other is NaT: - return NaT + # In this context we treat NaT as timedelta-like + return np.nan return float(other.value) / self.value def __floordiv__(self, other): diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index 5e55e6e8d5297..3cb4b6cd8113b 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 5cd3467eed042..7858072407a35 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -5,8 +5,7 @@ cimport numpy as cnp from numpy cimport int64_t cnp.import_array() -from datetime import time as datetime_time, timedelta -from cpython.datetime cimport (datetime, PyDateTime_Check, +from cpython.datetime cimport (datetime, time, PyDateTime_Check, PyDelta_Check, PyTZInfo_Check, PyDateTime_IMPORT) PyDateTime_IMPORT @@ -33,7 +32,7 @@ from pandas._libs.tslibs.tzconversion import ( # ---------------------------------------------------------------------- # Constants -_zero_time = datetime_time(0, 0) +_zero_time = time(0, 0) _no_input = object() # ---------------------------------------------------------------------- @@ -879,8 +878,7 @@ default 'raise' raise ValueError('Cannot infer offset with only one time.') nonexistent_options = ('raise', 'NaT', 'shift_forward', 'shift_backward') - if nonexistent not in nonexistent_options and not isinstance( - nonexistent, timedelta): + if nonexistent not in nonexistent_options and not PyDelta_Check(nonexistent): raise ValueError( "The nonexistent argument must be one of 'raise', " "'NaT', 'shift_forward', 'shift_backward' or a timedelta object" @@ -1090,11 +1088,10 @@ default 'raise' def normalize(self): """ - Normalize Timestamp to midnight, preserving - tz information. + Normalize Timestamp to midnight, preserving tz information. 
""" if self.tz is None or is_utc(self.tz): - DAY_NS = DAY_SECONDS * 1000000000 + DAY_NS = DAY_SECONDS * 1_000_000_000 normalized_value = self.value - (self.value % DAY_NS) return Timestamp(normalized_value).tz_localize(self.tz) normalized_value = normalize_i8_timestamps( @@ -1113,7 +1110,7 @@ cdef int64_t _NS_UPPER_BOUND = np.iinfo(np.int64).max # INT64_MIN + 1 == -9223372036854775807 # but to allow overflow free conversion with a microsecond resolution # use the smallest value with a 0 nanosecond unit (0s in last 3 digits) -cdef int64_t _NS_LOWER_BOUND = -9223372036854775000 +cdef int64_t _NS_LOWER_BOUND = -9_223_372_036_854_775_000 # Resolution is in nanoseconds Timestamp.min = Timestamp(_NS_LOWER_BOUND) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 50c4a41f97a82..6d6ae8f8576ad 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - cpdef bint is_utc(object tz) cdef bint is_tzlocal(object tz) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index a9702f91107ec..6915783ac3aaa 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -549,8 +549,9 @@ cdef int64_t _tz_convert_tzlocal_fromutc(int64_t val, tzinfo tz, bint *fold): @cython.boundscheck(False) @cython.wraparound(False) -cdef int64_t[:] _tz_convert_dst(int64_t[:] values, tzinfo tz, - bint to_utc=True): +cdef int64_t[:] _tz_convert_dst( + const int64_t[:] values, tzinfo tz, bint to_utc=True, +): """ tz_convert for non-UTC non-tzlocal cases where we have to check DST transitions pointwise. diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 80b9144042041..f3889039c095e 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -846,7 +846,7 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, res, prev - bint err = 0 + bint err = False int ret = 0 skiplist_t *sl Py_ssize_t i, j @@ -1013,7 +1013,7 @@ def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start, def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): """ - Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. + Moving min of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- @@ -1030,7 +1030,7 @@ def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_min_variable(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): """ - Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. + Moving min of 1d array of any numeric type along axis=0 ignoring NaNs. 
Parameters ---------- @@ -1051,7 +1051,7 @@ cdef _roll_min_max_variable(ndarray[numeric] values, bint is_max): cdef: numeric ai - int64_t i, close_offset, curr_win_size + int64_t i, k, curr_win_size, start Py_ssize_t nobs = 0, N = len(values) deque Q[int64_t] # min/max always the front deque W[int64_t] # track the whole window for nobs compute @@ -1068,60 +1068,45 @@ cdef _roll_min_max_variable(ndarray[numeric] values, # The original impl didn't deal with variable window sizes # So the code was optimized for that - for i in range(starti[0], endi[0]): - ai = init_mm(values[i], &nobs, is_max) - - # Discard previous entries if we find new min or max - if is_max: - while not Q.empty() and ((ai >= values[Q.back()]) or - values[Q.back()] != values[Q.back()]): - Q.pop_back() - else: - while not Q.empty() and ((ai <= values[Q.back()]) or - values[Q.back()] != values[Q.back()]): - Q.pop_back() - Q.push_back(i) - W.push_back(i) - - # if right is open then the first window is empty - close_offset = 0 if endi[0] > starti[0] else 1 # first window's size curr_win_size = endi[0] - starti[0] + # GH 32865 + # Anchor output index to values index to provide custom + # BaseIndexer support + for i in range(N): - for i in range(endi[0], endi[N-1]): - if not Q.empty() and curr_win_size > 0: - output[i-1+close_offset] = calc_mm( - minp, nobs, values[Q.front()]) - else: - output[i-1+close_offset] = NaN - - ai = init_mm(values[i], &nobs, is_max) - - # Discard previous entries if we find new min or max - if is_max: - while not Q.empty() and ((ai >= values[Q.back()]) or - values[Q.back()] != values[Q.back()]): - Q.pop_back() + curr_win_size = endi[i] - starti[i] + if i == 0: + start = starti[i] else: - while not Q.empty() and ((ai <= values[Q.back()]) or - values[Q.back()] != values[Q.back()]): - Q.pop_back() + start = endi[i - 1] - # Maintain window/nobs retention - curr_win_size = endi[i + close_offset] - starti[i + close_offset] - while not Q.empty() and Q.front() <= i - curr_win_size: + for k in range(start, endi[i]): + ai = init_mm(values[k], &nobs, is_max) + # Discard previous entries if we find new min or max + if is_max: + while not Q.empty() and ((ai >= values[Q.back()]) or + values[Q.back()] != values[Q.back()]): + Q.pop_back() + else: + while not Q.empty() and ((ai <= values[Q.back()]) or + values[Q.back()] != values[Q.back()]): + Q.pop_back() + Q.push_back(k) + W.push_back(k) + + # Discard entries outside and left of current window + while not Q.empty() and Q.front() <= starti[i] - 1: Q.pop_front() - while not W.empty() and W.front() <= i - curr_win_size: + while not W.empty() and W.front() <= starti[i] - 1: remove_mm(values[W.front()], &nobs) W.pop_front() - Q.push_back(i) - W.push_back(i) - - if not Q.empty() and curr_win_size > 0: - output[N-1] = calc_mm(minp, nobs, values[Q.front()]) - else: - output[N-1] = NaN + # Save output based on index in input value array + if not Q.empty() and curr_win_size > 0: + output[i] = calc_mm(minp, nobs, values[Q.front()]) + else: + output[i] = NaN return output diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 2d01d1964c043..8a1e7feb57ace 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -44,6 +44,7 @@ def calculate_variable_window_bounds( cdef: bint left_closed = False bint right_closed = False + int index_growth_sign = 1 ndarray[int64_t, ndim=1] start, end int64_t start_bound, end_bound Py_ssize_t i, j @@ -58,6 +59,9 @@ def calculate_variable_window_bounds( if closed in ['left', 'both']: 
left_closed = True + if index[num_values - 1] < index[0]: + index_growth_sign = -1 + start = np.empty(num_values, dtype='int64') start.fill(-1) end = np.empty(num_values, dtype='int64') @@ -78,7 +82,7 @@ def calculate_variable_window_bounds( # end is end of slice interval (not including) for i in range(1, num_values): end_bound = index[i] - start_bound = index[i] - window_size + start_bound = index[i] - index_growth_sign * window_size # left endpoint is closed if left_closed: @@ -88,13 +92,13 @@ def calculate_variable_window_bounds( # within the constraint start[i] = i for j in range(start[i - 1], i): - if index[j] > start_bound: + if (index[j] - start_bound) * index_growth_sign > 0: start[i] = j break # end bound is previous end # or current index - if index[end[i - 1]] <= end_bound: + if (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: end[i] = i + 1 else: end[i] = end[i - 1] diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 9e95dea979577..2d5b31d7ccbcf 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -36,7 +36,7 @@ def write_csv_rows( """ # In crude testing, N>100 yields little marginal improvement cdef: - Py_ssize_t i, j, k = len(data_index), N = 100, ncols = len(cols) + Py_ssize_t i, j = 0, k = len(data_index), N = 100, ncols = len(cols) list rows # pre-allocate rows @@ -82,7 +82,7 @@ def convert_json_to_lines(arr: object) -> str: """ cdef: Py_ssize_t i = 0, num_open_brackets_seen = 0, length - bint in_quotes = 0, is_escaping = 0 + bint in_quotes = False, is_escaping = False ndarray[uint8_t, ndim=1] narr unsigned char val, newline, comma, left_bracket, right_bracket, quote unsigned char backslash @@ -112,7 +112,7 @@ def convert_json_to_lines(arr: object) -> str: if not in_quotes: num_open_brackets_seen -= 1 - return narr.tostring().decode('utf-8') + return narr.tobytes().decode('utf-8') # stata, pytables diff --git a/pandas/_testing.py b/pandas/_testing.py index a70f75d6cfaf4..1f6b645c821c8 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -32,8 +32,8 @@ is_datetime64tz_dtype, is_extension_array_dtype, is_interval_dtype, - is_list_like, is_number, + is_numeric_dtype, is_period_dtype, is_sequence, is_timedelta64_dtype, @@ -417,10 +417,7 @@ def rands_array(nchars, size, dtype="O"): .view((np.str_, nchars)) .reshape(size) ) - if dtype is None: - return retval - else: - return retval.astype(dtype) + return retval.astype(dtype) def randu_array(nchars, size, dtype="O"): @@ -432,10 +429,7 @@ def randu_array(nchars, size, dtype="O"): .view((np.unicode_, nchars)) .reshape(size) ) - if dtype is None: - return retval - else: - return retval.astype(dtype) + return retval.astype(dtype) def rands(nchars): @@ -448,16 +442,6 @@ def rands(nchars): return "".join(np.random.choice(RANDS_CHARS, nchars)) -def randu(nchars): - """ - Generate one random unicode string. - - See `randu_array` if you want to create an array of random unicode strings. 
- - """ - return "".join(np.random.choice(RANDU_CHARS, nchars)) - - def close(fignum=None): from matplotlib.pyplot import get_fignums, close as _close @@ -706,11 +690,11 @@ def _get_ilevel_values(index, level): if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): assert_attr_equal("freq", left, right, obj=obj) if isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex): - assert_interval_array_equal(left.values, right.values) + assert_interval_array_equal(left._values, right._values) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal(left.values, right.values, obj=f"{obj} category") + assert_categorical_equal(left._values, right._values, obj=f"{obj} category") def assert_class_equal(left, right, exact: Union[bool, str] = True, obj="Input"): @@ -724,10 +708,7 @@ def repr_class(x): # return Index as it is to include values in the error message return x - try: - return type(x).__name__ - except AttributeError: - return repr(type(x)) + return type(x).__name__ if exact == "equiv": if type(left) != type(right): @@ -742,9 +723,9 @@ def repr_class(x): raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) -def assert_attr_equal(attr, left, right, obj="Attributes"): +def assert_attr_equal(attr: str, left, right, obj: str = "Attributes"): """ - checks attributes are equal. Both objects must have attribute. + Check attributes are equal. Both objects must have attribute. Parameters ---------- @@ -843,10 +824,14 @@ def assert_categorical_equal( left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", ) else: + try: + lc = left.categories.sort_values() + rc = right.categories.sort_values() + except TypeError: + # e.g. '<' not supported between instances of 'int' and 'str' + lc, rc = left.categories, right.categories assert_index_equal( - left.categories.sort_values(), - right.categories.sort_values(), - obj=f"{obj}.categories", + lc, rc, obj=f"{obj}.categories", ) assert_index_equal( left.categories.take(left.codes), @@ -883,7 +868,7 @@ def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray") def assert_period_array_equal(left, right, obj="PeriodArray"): _check_isinstance(left, right, PeriodArray) - assert_numpy_array_equal(left._data, right._data, obj=f"{obj}.values") + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") assert_attr_equal("freq", left, right, obj=obj) @@ -903,9 +888,16 @@ def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): assert_attr_equal("freq", left, right, obj=obj) -def raise_assert_detail(obj, message, left, right, diff=None): +def raise_assert_detail(obj, message, left, right, diff=None, index_values=None): __tracebackhide__ = True + msg = f"""{obj} are different + +{message}""" + + if isinstance(index_values, np.ndarray): + msg += f"\n[index]: {pprint_thing(index_values)}" + if isinstance(left, np.ndarray): left = pprint_thing(left) elif is_categorical_dtype(left): @@ -916,9 +908,7 @@ def raise_assert_detail(obj, message, left, right, diff=None): elif is_categorical_dtype(right): right = repr(right) - msg = f"""{obj} are different - -{message} + msg += f""" [left]: {left} [right]: {right}""" @@ -936,6 +926,7 @@ def assert_numpy_array_equal( err_msg=None, check_same=None, obj="numpy array", + index_values=None, ): """ Check that 'np.ndarray' is equivalent. 
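(Aside on the `pandas/_testing.py` hunks above and below: `assert_numpy_array_equal` gains an `index_values` argument that is threaded through to `raise_assert_detail`, so a failing comparison can echo the shared index in its message. A minimal sketch of how this could be exercised once the patch is applied; the sample arrays and labels here are illustrative only, not taken from the PR.)

```python
import numpy as np
import pandas._testing as tm

left = np.array([1.0, 2.0, 3.0])
right = np.array([1.0, 2.0, 4.0])
index = np.array(["a", "b", "c"], dtype=object)

try:
    # With index_values supplied, raise_assert_detail adds an
    # "[index]: ..." line alongside the usual "[left]/[right]" report.
    tm.assert_numpy_array_equal(left, right, index_values=index)
except AssertionError as exc:
    print(exc)
```

This mirrors how `assert_series_equal` below now passes `np.asarray(left.index)` through, so mismatching elements can be located by label rather than by position alone.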
@@ -955,6 +946,8 @@ def assert_numpy_array_equal( obj : str, default 'numpy array' Specify object name being compared, internally used to show appropriate assertion message. + index_values : numpy.ndarray, default None + optional index (shared by both left and right), used in output. """ __tracebackhide__ = True @@ -992,7 +985,7 @@ def _raise(left, right, err_msg): diff = diff * 100.0 / left.size msg = f"{obj} values are different ({np.round(diff, 5)} %)" - raise_assert_detail(obj, msg, left, right) + raise_assert_detail(obj, msg, left, right, index_values=index_values) raise AssertionError(err_msg) @@ -1037,7 +1030,8 @@ def assert_extension_array_equal( if hasattr(left, "asi8") and type(right) == type(left): # Avoid slow object-dtype comparisons - assert_numpy_array_equal(left.asi8, right.asi8) + # np.asarray for case where we have a np.MaskedArray + assert_numpy_array_equal(np.asarray(left.asi8), np.asarray(right.asi8)) return left_na = np.asarray(left.isna()) @@ -1086,7 +1080,7 @@ def assert_series_equal( Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True - Whether to check the Series class is identical. + Whether to check the Series class is identical. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. @@ -1106,7 +1100,7 @@ def assert_series_equal( check_categorical : bool, default True Whether to compare internal Categorical exactly. check_category_order : bool, default True - Whether to compare category order of internal Categoricals + Whether to compare category order of internal Categoricals. .. versionadded:: 1.0.2 obj : str, default 'Series' @@ -1119,10 +1113,7 @@ def assert_series_equal( _check_isinstance(left, right, Series) if check_series_type: - # ToDo: There are some tests using rhs is sparse - # lhs is dense. Should use assert_class_equal in future - assert isinstance(left, type(right)) - # assert_class_equal(left, right, obj=obj) + assert_class_equal(left, right, obj=obj) # length comparison if len(left) != len(right): @@ -1147,8 +1138,8 @@ def assert_series_equal( # is False. We'll still raise if only one is a `Categorical`, # regardless of `check_categorical` if ( - is_categorical_dtype(left) - and is_categorical_dtype(right) + is_categorical_dtype(left.dtype) + and is_categorical_dtype(right.dtype) and not check_categorical ): pass @@ -1156,53 +1147,54 @@ def assert_series_equal( assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") if check_exact: + if not is_numeric_dtype(left.dtype): + raise AssertionError("check_exact may only be used with numeric Series") + assert_numpy_array_equal( - left._internal_get_values(), - right._internal_get_values(), + left._values, + right._values, check_dtype=check_dtype, obj=str(obj), + index_values=np.asarray(left.index), ) - elif check_datetimelike_compat: + elif check_datetimelike_compat and ( + needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype) + ): # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check # the values in that case - if needs_i8_conversion(left) or needs_i8_conversion(right): - - # datetimelike may have different objects (e.g. datetime.datetime - # vs Timestamp) but will compare equal - if not Index(left.values).equals(Index(right.values)): - msg = ( - f"[datetimelike_compat=True] {left.values} " - f"is not equal to {right.values}." 
- ) - raise AssertionError(msg) - else: - assert_numpy_array_equal( - left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype, + + # datetimelike may have different objects (e.g. datetime.datetime + # vs Timestamp) but will compare equal + if not Index(left._values).equals(Index(right._values)): + msg = ( + f"[datetimelike_compat=True] {left._values} " + f"is not equal to {right._values}." ) - elif is_interval_dtype(left) or is_interval_dtype(right): + raise AssertionError(msg) + elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype): assert_interval_array_equal(left.array, right.array) - elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype): - # .values is an ndarray, but ._values is the ExtensionArray. - # TODO: Use .array - assert is_extension_array_dtype(right.dtype) + elif is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype): + _testing.assert_almost_equal( + left._values, + right._values, + check_less_precise=check_less_precise, + check_dtype=check_dtype, + obj=str(obj), + ) + elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype): + assert_extension_array_equal(left._values, right._values) + elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): + # DatetimeArray or TimedeltaArray assert_extension_array_equal(left._values, right._values) - elif ( - is_extension_array_dtype(left) - and not is_categorical_dtype(left) - and is_extension_array_dtype(right) - and not is_categorical_dtype(right) - ): - assert_extension_array_equal(left.array, right.array) else: _testing.assert_almost_equal( - left._internal_get_values(), - right._internal_get_values(), + left._values, + right._values, check_less_precise=check_less_precise, check_dtype=check_dtype, obj=str(obj), + index_values=np.asarray(left.index), ) # metadata comparison @@ -1212,8 +1204,8 @@ def assert_series_equal( if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): assert_categorical_equal( - left.values, - right.values, + left._values, + right._values, obj=f"{obj} category", check_category_order=check_category_order, ) @@ -1491,14 +1483,7 @@ def to_array(obj): # Sparse -def assert_sp_array_equal( - left, - right, - check_dtype=True, - check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, -): +def assert_sp_array_equal(left, right): """ Check that the left and right SparseArray are equal. @@ -1506,38 +1491,17 @@ def assert_sp_array_equal( ---------- left : SparseArray right : SparseArray - check_dtype : bool, default True - Whether to check the data dtype is identical. - check_kind : bool, default True - Whether to just the kind of the sparse index for each column. - check_fill_value : bool, default True - Whether to check that left.fill_value matches right.fill_value - consolidate_block_indices : bool, default False - Whether to consolidate contiguous blocks for sparse arrays with - a BlockIndex. Some operations, e.g. concat, will end up with - block indices that could be consolidated. Setting this to true will - create a new BlockIndex for that array, with consolidated - block indices. 
""" _check_isinstance(left, right, pd.arrays.SparseArray) - assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype) + assert_numpy_array_equal(left.sp_values, right.sp_values) # SparseIndex comparison assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex) assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex) - if not check_kind: - left_index = left.sp_index.to_block_index() - right_index = right.sp_index.to_block_index() - else: - left_index = left.sp_index - right_index = right.sp_index - - if consolidate_block_indices and left.kind == "block": - # we'll probably remove this hack... - left_index = left_index.to_int_index().to_block_index() - right_index = right_index.to_int_index().to_block_index() + left_index = left.sp_index + right_index = right.sp_index if not left_index.equals(right_index): raise_assert_detail( @@ -1547,11 +1511,9 @@ def assert_sp_array_equal( # Just ensure a pass - if check_fill_value: - assert_attr_equal("fill_value", left, right) - if check_dtype: - assert_attr_equal("dtype", left, right) - assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) + assert_attr_equal("fill_value", left, right) + assert_attr_equal("dtype", left, right) + assert_numpy_array_equal(left.to_dense(), right.to_dense()) # ----------------------------------------------------------------------------- @@ -1734,32 +1696,6 @@ def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None) return df -def all_index_generator(k=10): - """ - Generator which can be iterated over to get instances of all the various - index classes. - - Parameters - ---------- - k: length of each of the index instances - """ - all_make_index_funcs = [ - makeIntIndex, - makeFloatIndex, - makeStringIndex, - makeUnicodeIndex, - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeBoolIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - ] - for make_index_func in all_make_index_funcs: - yield make_index_func(k=k) - - def index_subclass_makers_generator(): make_index_funcs = [ makeDateIndex, @@ -2103,53 +2039,6 @@ def _gen_unique_rand(rng, _extra_size): return i.tolist(), j.tolist() -def makeMissingCustomDataframe( - nrows, - ncols, - density=0.9, - random_state=None, - c_idx_names=True, - r_idx_names=True, - c_idx_nlevels=1, - r_idx_nlevels=1, - data_gen_f=None, - c_ndupe_l=None, - r_ndupe_l=None, - dtype=None, - c_idx_type=None, - r_idx_type=None, -): - """ - Parameters - ---------- - Density : float, optional - Float in (0, 1) that gives the percentage of non-missing numbers in - the DataFrame. - random_state : {np.random.RandomState, int}, optional - Random number generator or random seed. - - See makeCustomDataframe for descriptions of the rest of the parameters. 
- """ - df = makeCustomDataframe( - nrows, - ncols, - c_idx_names=c_idx_names, - r_idx_names=r_idx_names, - c_idx_nlevels=c_idx_nlevels, - r_idx_nlevels=r_idx_nlevels, - data_gen_f=data_gen_f, - c_ndupe_l=c_ndupe_l, - r_ndupe_l=r_ndupe_l, - dtype=dtype, - c_idx_type=c_idx_type, - r_idx_type=r_idx_type, - ) - - i, j = _create_missing_idx(nrows, ncols, density, random_state) - df.values[i, j] = np.nan - return df - - def makeMissingDataframe(density=0.9, random_state=None): df = makeDataFrame() i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) @@ -2304,7 +2193,7 @@ def network( Notes ----- - * ``raise_on_error`` supercedes ``check_before_test`` + * ``raise_on_error`` supersedes ``check_before_test`` Returns ------- @@ -2397,7 +2286,6 @@ def wrapper(*args, **kwargs): def assert_produces_warning( expected_warning=Warning, filter_level="always", - clear=None, check_stacklevel=True, raise_on_extra_warnings=True, ): @@ -2427,12 +2315,6 @@ class for all warnings. To check that no warning is returned, from each module * "once" - print the warning the first time it is generated - clear : str, default None - If not ``None`` then remove any previously raised warnings from - the ``__warningsregistry__`` to ensure that no warning messages are - suppressed by this context manager. If ``None`` is specified, - the ``__warningsregistry__`` keeps track of which warnings have been - shown, and does not show them again. check_stacklevel : bool, default True If True, displays the line that called the function containing the warning to show were the function is called. Otherwise, the @@ -2465,19 +2347,6 @@ class for all warnings. To check that no warning is returned, with warnings.catch_warnings(record=True) as w: - if clear is not None: - # make sure that we are clearing these warnings - # if they have happened before - # to guarantee that we will catch them - if not is_list_like(clear): - clear = [clear] - for m in clear: - try: - m.__warningregistry__.clear() - except AttributeError: - # module may not have __warningregistry__ - pass - saw_warning = False warnings.simplefilter(filter_level) yield w @@ -2793,3 +2662,34 @@ def external_error_raised( import pytest return pytest.raises(expected_exception, match=None) + + +cython_table = pd.core.base.SelectionMixin._cython_table.items() + + +def get_cython_table_params(ndframe, func_names_and_expected): + """ + Combine frame, functions from SelectionMixin._cython_table + keys and expected result. + + Parameters + ---------- + ndframe : DataFrame or Series + func_names_and_expected : Sequence of two items + The first item is a name of a NDFrame method ('sum', 'prod') etc. + The second item is the expected return value. 
+ + Returns + ------- + list + List of three items (DataFrame, function, expected result) + """ + results = [] + for func_name, expected in func_names_and_expected: + results.append((ndframe, func_name, expected)) + results += [ + (ndframe, func, expected) + for func, name in cython_table + if name == func_name + ] + return results diff --git a/pandas/_typing.py b/pandas/_typing.py index e2858441605f7..850f10bd7f811 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -11,6 +11,7 @@ List, Mapping, Optional, + Type, TypeVar, Union, ) @@ -44,7 +45,9 @@ # other -Dtype = Union[str, np.dtype, "ExtensionDtype"] +Dtype = Union[ + "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool]] +] DtypeObj = Union[np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] @@ -64,7 +67,7 @@ Label = Optional[Hashable] Level = Union[Label, int] Ordered = Optional[bool] -JSONSerializable = Union[PythonScalar, List, Dict] +JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Axes = Collection # For functions like rename that convert one label to another @@ -72,3 +75,7 @@ # to maintain type information across generic functions and parametrization T = TypeVar("T") +# used in decorators to preserve the signature of the function it decorates +# see https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators +FuncType = Callable[..., Any] +F = TypeVar("F", bound=FuncType) diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index 826297e6b498f..0b36b53675e23 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -3,6 +3,6 @@ """ from pandas.core.indexers import check_array_indexer -from pandas.core.window.indexers import BaseIndexer +from pandas.core.window.indexers import BaseIndexer, FixedForwardWindowIndexer -__all__ = ["check_array_indexer", "BaseIndexer"] +__all__ = ["check_array_indexer", "BaseIndexer", "FixedForwardWindowIndexer"] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 3547a33ea357b..6570e0782a69a 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -12,6 +12,8 @@ import sys import warnings +from pandas._typing import F + PY37 = sys.version_info >= (3, 7) PY38 = sys.version_info >= (3, 8) PYPY = platform.python_implementation() == "PyPy" @@ -25,7 +27,7 @@ # found at https://bitbucket.org/gutworth/six -def set_function_name(f, name, cls): +def set_function_name(f: F, name: str, cls) -> F: """ Bind the name/qualname attributes of the function. """ diff --git a/pandas/conftest.py b/pandas/conftest.py index be44e6c2b36da..e1088dae3925a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,8 +1,29 @@ +""" +This file is very long and growing, but it was decided to not split it yet, as +it's still manageable (2020-03-17, ~1.1k LoC). See gh-31989 + +Instead of splitting it was decided to define sections here: +- Configuration / Settings +- Autouse fixtures +- Common arguments +- Missing values & co. 
+- Classes +- Indices +- Series' +- DataFrames +- Operators & Operations +- Data sets/files +- Time zones +- Dtypes +- Misc +""" + from collections import abc from datetime import date, time, timedelta, timezone from decimal import Decimal import operator import os +from typing import List from dateutil.tz import tzlocal, tzutc import hypothesis @@ -11,6 +32,7 @@ import pytest from pytz import FixedOffset, utc +from pandas._typing import Dtype import pandas.util._test_decorators as td import pandas as pd @@ -19,19 +41,11 @@ from pandas.core import ops from pandas.core.indexes.api import Index, MultiIndex -hypothesis.settings.register_profile( - "ci", - # Hypothesis timing checks are tuned for scalars by default, so we bump - # them from 200ms to 500ms per test case as the global default. If this - # is too short for a specific test, (a) try to make it faster, and (b) - # if it really is slow add `@settings(deadline=...)` with a working value, - # or `deadline=None` to entirely disable timeouts for that test. - deadline=500, - suppress_health_check=(hypothesis.HealthCheck.too_slow,), -) -hypothesis.settings.load_profile("ci") - +# ---------------------------------------------------------------- +# Configuration / Settings +# ---------------------------------------------------------------- +# pytest def pytest_addoption(parser): parser.addoption("--skip-slow", action="store_true", help="skip slow tests") parser.addoption("--skip-network", action="store_true", help="skip network tests") @@ -66,86 +80,542 @@ def pytest_runtest_setup(item): pytest.skip("skipping high memory test since --run-high-memory was not set") -@pytest.fixture(autouse=True) -def configure_tests(): - """ - Configure settings for all tests and test modules. - """ - pd.set_option("chained_assignment", "raise") +# Hypothesis +hypothesis.settings.register_profile( + "ci", + # Hypothesis timing checks are tuned for scalars by default, so we bump + # them from 200ms to 500ms per test case as the global default. If this + # is too short for a specific test, (a) try to make it faster, and (b) + # if it really is slow add `@settings(deadline=...)` with a working value, + # or `deadline=None` to entirely disable timeouts for that test. 
+ deadline=500, + suppress_health_check=(hypothesis.HealthCheck.too_slow,), +) +hypothesis.settings.load_profile("ci") + +# Registering these strategies makes them globally available via st.from_type, +# which is use for offsets in tests/tseries/offsets/test_offsets_properties.py +for name in "MonthBegin MonthEnd BMonthBegin BMonthEnd".split(): + cls = getattr(pd.tseries.offsets, name) + st.register_type_strategy( + cls, st.builds(cls, n=st.integers(-99, 99), normalize=st.booleans()) + ) + +for name in "YearBegin YearEnd BYearBegin BYearEnd".split(): + cls = getattr(pd.tseries.offsets, name) + st.register_type_strategy( + cls, + st.builds( + cls, + n=st.integers(-5, 5), + normalize=st.booleans(), + month=st.integers(min_value=1, max_value=12), + ), + ) + +for name in "QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd".split(): + cls = getattr(pd.tseries.offsets, name) + st.register_type_strategy( + cls, + st.builds( + cls, + n=st.integers(-24, 24), + normalize=st.booleans(), + startingMonth=st.integers(min_value=1, max_value=12), + ), + ) + + +# ---------------------------------------------------------------- +# Autouse fixtures +# ---------------------------------------------------------------- +@pytest.fixture(autouse=True) +def configure_tests(): + """ + Configure settings for all tests and test modules. + """ + pd.set_option("chained_assignment", "raise") + + +@pytest.fixture(autouse=True) +def add_imports(doctest_namespace): + """ + Make `np` and `pd` names available for doctests. + """ + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + + +# ---------------------------------------------------------------- +# Common arguments +# ---------------------------------------------------------------- +@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis {repr(x)}") +def axis(request): + """ + Fixture for returning the axis numbers of a DataFrame. + """ + return request.param + + +axis_frame = axis + + +@pytest.fixture(params=[0, "index"], ids=lambda x: f"axis {repr(x)}") +def axis_series(request): + """ + Fixture for returning the axis numbers of a Series. + """ + return request.param + + +@pytest.fixture(params=[True, False, None]) +def observed(request): + """ + Pass in the observed keyword to groupby for [True, False] + This indicates whether categoricals should return values for + values which are not in the grouper [False / None], or only values which + appear in the grouper [True]. [None] is supported for future compatibility + if we decide to change the default (and would need to warn if this + parameter is not passed). + """ + return request.param + + +@pytest.fixture(params=[True, False, None]) +def ordered(request): + """ + Boolean 'ordered' parameter for Categorical. + """ + return request.param + + +@pytest.fixture(params=["first", "last", False]) +def keep(request): + """ + Valid values for the 'keep' parameter used in + .duplicated or .drop_duplicates + """ + return request.param + + +@pytest.fixture(params=["left", "right", "both", "neither"]) +def closed(request): + """ + Fixture for trying all interval closed parameters. + """ + return request.param + + +@pytest.fixture(params=["left", "right", "both", "neither"]) +def other_closed(request): + """ + Secondary closed fixture to allow parametrizing over all pairs of closed. + """ + return request.param + + +@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) +def compression(request): + """ + Fixture for trying common compression types in compression tests. 
+ """ + return request.param + + +@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) +def compression_only(request): + """ + Fixture for trying common compression types in compression tests excluding + uncompressed case. + """ + return request.param + + +@pytest.fixture(params=[True, False]) +def writable(request): + """ + Fixture that an array is writable. + """ + return request.param + + +@pytest.fixture(params=["inner", "outer", "left", "right"]) +def join_type(request): + """ + Fixture for trying all types of join operations. + """ + return request.param + + +@pytest.fixture(params=["nlargest", "nsmallest"]) +def nselect_method(request): + """ + Fixture for trying all nselect methods. + """ + return request.param + + +# ---------------------------------------------------------------- +# Missing values & co. +# ---------------------------------------------------------------- +@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN"), pd.NA]) +def nulls_fixture(request): + """ + Fixture for each null type in pandas. + """ + return request.param + + +nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture + + +@pytest.fixture(params=[None, np.nan, pd.NaT]) +def unique_nulls_fixture(request): + """ + Fixture for each null type in pandas, each null type exactly once. + """ + return request.param + + +# Generate cartesian product of unique_nulls_fixture: +unique_nulls_fixture2 = unique_nulls_fixture + + +# ---------------------------------------------------------------- +# Classes +# ---------------------------------------------------------------- +@pytest.fixture(params=[pd.Index, pd.Series], ids=["index", "series"]) +def index_or_series(request): + """ + Fixture to parametrize over Index and Series, made necessary by a mypy + bug, giving an error: + + List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]" + + See GH#29725 + """ + return request.param + + +@pytest.fixture +def dict_subclass(): + """ + Fixture for a dictionary subclass. + """ + + class TestSubDict(dict): + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) + + return TestSubDict + + +@pytest.fixture +def non_dict_mapping_subclass(): + """ + Fixture for a non-mapping dictionary subclass. 
+ """ + + class TestNonDictMapping(abc.Mapping): + def __init__(self, underlying_dict): + self._data = underlying_dict + + def __getitem__(self, key): + return self._data.__getitem__(key) + + def __iter__(self): + return self._data.__iter__() + + def __len__(self): + return self._data.__len__() + + return TestNonDictMapping + + +# ---------------------------------------------------------------- +# Indices +# ---------------------------------------------------------------- +@pytest.fixture +def multiindex_year_month_day_dataframe_random_data(): + """ + DataFrame with 3 level MultiIndex (year, month, day) covering + first 100 business days from 2000-01-01 with random data + """ + tdf = tm.makeTimeDataFrame(100) + ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() + # use Int64Index, to make sure things work + ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) + ymd.index.set_names(["year", "month", "day"], inplace=True) + return ymd + + +def _create_multiindex(): + """ + MultiIndex used to test the general functionality of this object + """ + + # See Also: tests.multi.conftest.idx + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 1, 2, 3, 3]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) + return mi + + +def _create_mi_with_dt64tz_level(): + """ + MultiIndex with a level that is a tzaware DatetimeIndex. + """ + # GH#8367 round trip with pickle + return MultiIndex.from_product( + [[1, 2], ["a", "b"], pd.date_range("20130101", periods=3, tz="US/Eastern")], + names=["one", "two", "three"], + ) + + +indices_dict = { + "unicode": tm.makeUnicodeIndex(100), + "string": tm.makeStringIndex(100), + "datetime": tm.makeDateIndex(100), + "datetime-tz": tm.makeDateIndex(100, tz="US/Pacific"), + "period": tm.makePeriodIndex(100), + "timedelta": tm.makeTimedeltaIndex(100), + "int": tm.makeIntIndex(100), + "uint": tm.makeUIntIndex(100), + "range": tm.makeRangeIndex(100), + "float": tm.makeFloatIndex(100), + "bool": tm.makeBoolIndex(10), + "categorical": tm.makeCategoricalIndex(100), + "interval": tm.makeIntervalIndex(100), + "empty": Index([]), + "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), + "multi": _create_multiindex(), + "repeats": Index([0, 0, 1, 1, 2, 2]), +} + + +@pytest.fixture(params=indices_dict.keys()) +def indices(request): + """ + Fixture for many "simple" kinds of indices. + + These indices are unlikely to cover corner cases, e.g. + - no names + - no NaTs/NaNs + - no values near implementation bounds + - ... + """ + # copy to avoid mutation, e.g. 
setting .name + return indices_dict[request.param].copy() + + +# Needed to generate cartesian product of indices +index_fixture2 = indices + + +# ---------------------------------------------------------------- +# Series' +# ---------------------------------------------------------------- +@pytest.fixture +def empty_series(): + return pd.Series([], index=[], dtype=np.float64) + + +@pytest.fixture +def string_series(): + """ + Fixture for Series of floats with Index of unique strings + """ + s = tm.makeStringSeries() + s.name = "series" + return s + + +@pytest.fixture +def object_series(): + """ + Fixture for Series of dtype object with Index of unique strings + """ + s = tm.makeObjectSeries() + s.name = "objects" + return s + + +@pytest.fixture +def datetime_series(): + """ + Fixture for Series of floats with DatetimeIndex + """ + s = tm.makeTimeSeries() + s.name = "ts" + return s + + +def _create_series(index): + """ Helper for the _series dict """ + size = len(index) + data = np.random.randn(size) + return pd.Series(data, index=index, name="a") + + +_series = { + f"series-with-{index_id}-index": _create_series(index) + for index_id, index in indices_dict.items() +} -@pytest.fixture(autouse=True) -def add_imports(doctest_namespace): +@pytest.fixture +def series_with_simple_index(indices): """ - Make `np` and `pd` names available for doctests. + Fixture for tests on series with changing types of indices. """ - doctest_namespace["np"] = np - doctest_namespace["pd"] = pd - + return _create_series(indices) -@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) -def spmatrix(request): - """ - Yields scipy sparse matrix classes. - """ - from scipy import sparse - return getattr(sparse, request.param + "_matrix") +_narrow_dtypes = [ + np.float16, + np.float32, + np.int8, + np.int16, + np.int32, + np.uint8, + np.uint16, + np.uint32, +] +_narrow_series = { + f"{dtype.__name__}-series": tm.makeFloatSeries(name="a").astype(dtype) + for dtype in _narrow_dtypes +} -@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis {repr(x)}") -def axis(request): +@pytest.fixture(params=_narrow_series.keys()) +def narrow_series(request): """ - Fixture for returning the axis numbers of a DataFrame. + Fixture for Series with low precision data types """ - return request.param + # copy to avoid mutation, e.g. setting .name + return _narrow_series[request.param].copy() -axis_frame = axis +_index_or_series_objs = {**indices_dict, **_series, **_narrow_series} -@pytest.fixture(params=[0, "index"], ids=lambda x: f"axis {repr(x)}") -def axis_series(request): +@pytest.fixture(params=_index_or_series_objs.keys()) +def index_or_series_obj(request): """ - Fixture for returning the axis numbers of a Series. + Fixture for tests on indexes, series and series with a narrow dtype + copy to avoid mutation, e.g. setting .name """ - return request.param + return _index_or_series_objs[request.param].copy(deep=True) +# ---------------------------------------------------------------- +# DataFrames +# ---------------------------------------------------------------- @pytest.fixture -def ip(): +def empty_frame(): + return DataFrame() + + +@pytest.fixture +def int_frame(): + """ + Fixture for DataFrame of ints with index of unique strings + + Columns are ['A', 'B', 'C', 'D'] + + A B C D + vpBeWjM651 1 0 1 0 + 5JyxmrP1En -1 0 0 0 + qEDaoD49U2 -1 1 0 0 + m66TkTfsFe 0 0 0 0 + EHPaNzEUFm -1 0 -1 0 + fpRJCevQhi 2 0 0 0 + OlQvnmfi3Q 0 0 -2 0 + ... .. .. .. .. 
+ uB1FPlz4uP 0 0 0 1 + EcSe6yNzCU 0 0 -1 0 + L50VudaiI8 -1 1 -2 0 + y3bpw4nwIp 0 -1 0 0 + H0RdLLwrCT 1 1 0 0 + rY82K0vMwm 0 0 0 0 + 1OPIUjnkjk 2 0 0 0 + + [30 rows x 4 columns] """ - Get an instance of IPython.InteractiveShell. + return DataFrame(tm.getSeriesData()).astype("int64") - Will raise a skip if IPython is not installed. + +@pytest.fixture +def datetime_frame(): """ - pytest.importorskip("IPython", minversion="6.0.0") - from IPython.core.interactiveshell import InteractiveShell + Fixture for DataFrame of floats with DatetimeIndex - return InteractiveShell() + Columns are ['A', 'B', 'C', 'D'] + A B C D + 2000-01-03 -1.122153 0.468535 0.122226 1.693711 + 2000-01-04 0.189378 0.486100 0.007864 -1.216052 + 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 + 2000-01-06 0.430050 0.894352 0.090719 0.036939 + 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 + 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 + 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 + ... ... ... ... ... + 2000-02-03 1.642618 -0.579288 0.046005 1.385249 + 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 + 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 + 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 + 2000-02-09 1.377373 0.398619 1.008453 -0.928207 + 2000-02-10 0.473194 -0.636677 0.984058 0.511519 + 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 -@pytest.fixture(params=[True, False, None]) -def observed(request): - """ - Pass in the observed keyword to groupby for [True, False] - This indicates whether categoricals should return values for - values which are not in the grouper [False / None], or only values which - appear in the grouper [True]. [None] is supported for future compatibility - if we decide to change the default (and would need to warn if this - parameter is not passed). + [30 rows x 4 columns] """ - return request.param + return DataFrame(tm.getTimeSeriesData()) -@pytest.fixture(params=[True, False, None]) -def ordered_fixture(request): +@pytest.fixture +def float_frame(): """ - Boolean 'ordered' parameter for Categorical. + Fixture for DataFrame of floats with index of unique strings + + Columns are ['A', 'B', 'C', 'D']. + + A B C D + P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 + qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 + tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 + wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 + M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 + QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 + r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 + ... ... ... ... ... + IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 + lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 + qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 + yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 + 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 + eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 + xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 + + [30 rows x 4 columns] """ - return request.param + return DataFrame(tm.getSeriesData()) +# ---------------------------------------------------------------- +# Operators & Operations +# ---------------------------------------------------------------- _all_arithmetic_operators = [ "__add__", "__radd__", @@ -235,46 +705,6 @@ def all_boolean_reductions(request): return request.param -_cython_table = pd.core.base.SelectionMixin._cython_table.items() - - -@pytest.fixture(params=list(_cython_table)) -def cython_table_items(request): - """ - Yields a tuple of a function and its corresponding name. 
Correspond to - the list of aggregator "Cython functions" used on selected table items. - """ - return request.param - - -def _get_cython_table_params(ndframe, func_names_and_expected): - """ - Combine frame, functions from SelectionMixin._cython_table - keys and expected result. - - Parameters - ---------- - ndframe : DataFrame or Series - func_names_and_expected : Sequence of two items - The first item is a name of a NDFrame method ('sum', 'prod') etc. - The second item is the expected return value. - - Returns - ------- - list - List of three items (DataFrame, function, expected result) - """ - results = [] - for func_name, expected in func_names_and_expected: - results.append((ndframe, func_name, expected)) - results += [ - (ndframe, func, expected) - for func, name in _cython_table - if name == func_name - ] - return results - - @pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"]) def all_compare_operators(request): """ @@ -317,55 +747,9 @@ def all_logical_operators(request): return request.param -@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) -def compression(request): - """ - Fixture for trying common compression types in compression tests. - """ - return request.param - - -@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) -def compression_only(request): - """ - Fixture for trying common compression types in compression tests excluding - uncompressed case. - """ - return request.param - - -@pytest.fixture(params=[True, False]) -def writable(request): - """ - Fixture that an array is writable. - """ - return request.param - - -@pytest.fixture(scope="module") -def datetime_tz_utc(): - """ - Yields the UTC timezone object from the datetime module. - """ - return timezone.utc - - -@pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]) -def utc_fixture(request): - """ - Fixture to provide variants of UTC timezone strings and tzinfo objects. - """ - return request.param - - -@pytest.fixture(params=["inner", "outer", "left", "right"]) -def join_type(request): - """ - Fixture for trying all types of join operations. - """ - return request.param - - +# ---------------------------------------------------------------- +# Data sets/files +# ---------------------------------------------------------------- @pytest.fixture def strict_data_files(pytestconfig): """ @@ -417,53 +801,9 @@ def iris(datapath): return pd.read_csv(datapath("data", "iris.csv")) -@pytest.fixture(params=["nlargest", "nsmallest"]) -def nselect_method(request): - """ - Fixture for trying all nselect methods. - """ - return request.param - - -@pytest.fixture(params=["left", "right", "both", "neither"]) -def closed(request): - """ - Fixture for trying all interval closed parameters. - """ - return request.param - - -@pytest.fixture(params=["left", "right", "both", "neither"]) -def other_closed(request): - """ - Secondary closed fixture to allow parametrizing over all pairs of closed. - """ - return request.param - - -@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN"), pd.NA]) -def nulls_fixture(request): - """ - Fixture for each null type in pandas. - """ - return request.param - - -nulls_fixture2 = nulls_fixture # Generate cartesian product of nulls_fixture - - -@pytest.fixture(params=[None, np.nan, pd.NaT]) -def unique_nulls_fixture(request): - """ - Fixture for each null type in pandas, each null type exactly once. 
- """ - return request.param - - -# Generate cartesian product of unique_nulls_fixture: -unique_nulls_fixture2 = unique_nulls_fixture - - +# ---------------------------------------------------------------- +# Time zones +# ---------------------------------------------------------------- TIMEZONES = [ None, "UTC", @@ -487,38 +827,54 @@ def unique_nulls_fixture(request): @pytest.fixture(params=TIMEZONES, ids=TIMEZONE_IDS) def tz_naive_fixture(request): """ - Fixture for trying timezones including default (None): {0} + Fixture for trying timezones including default (None): {0} + """ + return request.param + + +@td.parametrize_fixture_doc(str(TIMEZONE_IDS[1:])) +@pytest.fixture(params=TIMEZONES[1:], ids=TIMEZONE_IDS[1:]) +def tz_aware_fixture(request): + """ + Fixture for trying explicit timezones: {0} + """ + return request.param + + +# Generate cartesian product of tz_aware_fixture: +tz_aware_fixture2 = tz_aware_fixture + + +@pytest.fixture(scope="module") +def datetime_tz_utc(): + """ + Yields the UTC timezone object from the datetime module. """ - return request.param + return timezone.utc -@td.parametrize_fixture_doc(str(TIMEZONE_IDS[1:])) -@pytest.fixture(params=TIMEZONES[1:], ids=TIMEZONE_IDS[1:]) -def tz_aware_fixture(request): +@pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]) +def utc_fixture(request): """ - Fixture for trying explicit timezones: {0} + Fixture to provide variants of UTC timezone strings and tzinfo objects. """ return request.param -# Generate cartesian product of tz_aware_fixture: -tz_aware_fixture2 = tz_aware_fixture - - # ---------------------------------------------------------------- # Dtypes # ---------------------------------------------------------------- UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"] -SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] +SIGNED_INT_DTYPES: List[Dtype] = [int, "int8", "int16", "int32", "int64"] SIGNED_EA_INT_DTYPES = ["Int8", "Int16", "Int32", "Int64"] ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES -FLOAT_DTYPES = [float, "float32", "float64"] -COMPLEX_DTYPES = [complex, "complex64", "complex128"] -STRING_DTYPES = [str, "str", "U"] +FLOAT_DTYPES: List[Dtype] = [float, "float32", "float64"] +COMPLEX_DTYPES: List[Dtype] = [complex, "complex64", "complex128"] +STRING_DTYPES: List[Dtype] = [str, "str", "U"] DATETIME64_DTYPES = ["datetime64[ns]", "M8[ns]"] TIMEDELTA64_DTYPES = ["timedelta64[ns]", "m8[ns]"] @@ -818,242 +1174,50 @@ def any_skipna_inferred_dtype(request): return inferred_dtype, values -@pytest.fixture( - params=[ - getattr(pd.offsets, o) - for o in pd.offsets.__all__ - if issubclass(getattr(pd.offsets, o), pd.offsets.Tick) - ] -) -def tick_classes(request): - """ - Fixture for Tick based datetime offsets available for a time series. 
- """ - return request.param - - # ---------------------------------------------------------------- -# Global setup for tests using Hypothesis - - -# Registering these strategies makes them globally available via st.from_type, -# which is use for offsets in tests/tseries/offsets/test_offsets_properties.py -for name in "MonthBegin MonthEnd BMonthBegin BMonthEnd".split(): - cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy( - cls, st.builds(cls, n=st.integers(-99, 99), normalize=st.booleans()) - ) - -for name in "YearBegin YearEnd BYearBegin BYearEnd".split(): - cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy( - cls, - st.builds( - cls, - n=st.integers(-5, 5), - normalize=st.booleans(), - month=st.integers(min_value=1, max_value=12), - ), - ) - -for name in "QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd".split(): - cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy( - cls, - st.builds( - cls, - n=st.integers(-24, 24), - normalize=st.booleans(), - startingMonth=st.integers(min_value=1, max_value=12), - ), - ) - - -@pytest.fixture -def datetime_series(): - """ - Fixture for Series of floats with DatetimeIndex - """ - s = tm.makeTimeSeries() - s.name = "ts" - return s - - -@pytest.fixture -def float_frame(): - """ - Fixture for DataFrame of floats with index of unique strings - - Columns are ['A', 'B', 'C', 'D']. - - A B C D - P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 - qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 - tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 - wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 - M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 - QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 - r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 - ... ... ... ... ... - IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 - lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 - qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 - yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 - 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 - eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 - xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 - - [30 rows x 4 columns] - """ - return DataFrame(tm.getSeriesData()) - - -@pytest.fixture(params=[pd.Index, pd.Series], ids=["index", "series"]) -def index_or_series(request): - """ - Fixture to parametrize over Index and Series, made necessary by a mypy - bug, giving an error: - - List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]" - - See GH#29725 - """ - return request.param - - +# Misc +# ---------------------------------------------------------------- @pytest.fixture -def dict_subclass(): - """ - Fixture for a dictionary subclass. +def ip(): """ + Get an instance of IPython.InteractiveShell. - class TestSubDict(dict): - def __init__(self, *args, **kwargs): - dict.__init__(self, *args, **kwargs) - - return TestSubDict - - -@pytest.fixture -def non_mapping_dict_subclass(): - """ - Fixture for a non-mapping dictionary subclass. + Will raise a skip if IPython is not installed. 
""" + pytest.importorskip("IPython", minversion="6.0.0") + from IPython.core.interactiveshell import InteractiveShell - class TestNonDictMapping(abc.Mapping): - def __init__(self, underlying_dict): - self._data = underlying_dict - - def __getitem__(self, key): - return self._data.__getitem__(key) - - def __iter__(self): - return self._data.__iter__() - - def __len__(self): - return self._data.__len__() - - return TestNonDictMapping - - -def _gen_mi(): - # a MultiIndex used to test the general functionality of this object - - # See Also: tests.multi.conftest.idx - major_axis = Index(["foo", "bar", "baz", "qux"]) - minor_axis = Index(["one", "two"]) - - major_codes = np.array([0, 0, 1, 2, 3, 3]) - minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index_names = ["first", "second"] - mi = MultiIndex( - levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=index_names, - verify_integrity=False, - ) - return mi - - -indices_dict = { - "unicode": tm.makeUnicodeIndex(100), - "string": tm.makeStringIndex(100), - "datetime": tm.makeDateIndex(100), - "datetime-tz": tm.makeDateIndex(100, tz="US/Pacific"), - "period": tm.makePeriodIndex(100), - "timedelta": tm.makeTimedeltaIndex(100), - "int": tm.makeIntIndex(100), - "uint": tm.makeUIntIndex(100), - "range": tm.makeRangeIndex(100), - "float": tm.makeFloatIndex(100), - "bool": tm.makeBoolIndex(10), - "categorical": tm.makeCategoricalIndex(100), - "interval": tm.makeIntervalIndex(100), - "empty": Index([]), - "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), - "multi": _gen_mi(), - "repeats": Index([0, 0, 1, 1, 2, 2]), -} + return InteractiveShell() -@pytest.fixture(params=indices_dict.keys()) -def indices(request): +@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) +def spmatrix(request): """ - Fixture for many "simple" kinds of indices. - - These indices are unlikely to cover corner cases, e.g. - - no names - - no NaTs/NaNs - - no values near implementation bounds - - ... + Yields scipy sparse matrix classes. """ - # copy to avoid mutation, e.g. setting .name - return indices_dict[request.param].copy() - - -def _create_series(index): - """ Helper for the _series dict """ - size = len(index) - data = np.random.randn(size) - return pd.Series(data, index=index, name="a") - + from scipy import sparse -_series = { - f"series-with-{index_id}-index": _create_series(index) - for index_id, index in indices_dict.items() -} + return getattr(sparse, request.param + "_matrix") -@pytest.fixture -def series_with_simple_index(indices): +@pytest.fixture(params=list(tm.cython_table)) +def cython_table_items(request): """ - Fixture for tests on series with changing types of indices. + Yields a tuple of a function and its corresponding name. Correspond to + the list of aggregator "Cython functions" used on selected table items. 
""" - return _create_series(indices) - - -_narrow_dtypes = [ - np.float16, - np.float32, - np.int8, - np.int16, - np.int32, - np.uint8, - np.uint16, - np.uint32, -] -_narrow_series = { - f"{dtype.__name__}-series": tm.makeFloatSeries(name="a").astype(dtype) - for dtype in _narrow_dtypes -} - -_index_or_series_objs = {**indices_dict, **_series, **_narrow_series} + return request.param -@pytest.fixture(params=_index_or_series_objs.keys()) -def index_or_series_obj(request): +@pytest.fixture( + params=[ + getattr(pd.offsets, o) + for o in pd.offsets.__all__ + if issubclass(getattr(pd.offsets, o), pd.offsets.Tick) + ] +) +def tick_classes(request): """ - Fixture for tests on indexes, series and series with a narrow dtype - copy to avoid mutation, e.g. setting .name + Fixture for Tick based datetime offsets available for a time series. """ - return _index_or_series_objs[request.param].copy(deep=True) + return request.param diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index fc40f1db1918a..f970cefe15527 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -211,7 +211,9 @@ def _register_accessor(name, cls): See Also -------- - {others} + register_dataframe_accessor : Register a custom accessor on DataFrame objects. + register_series_accessor : Register a custom accessor on Series objects. + register_index_accessor : Register a custom accessor on Index objects. Notes ----- @@ -255,12 +257,13 @@ def plot(self): Back in an interactive IPython session: - >>> ds = pd.DataFrame({{'longitude': np.linspace(0, 10), - ... 'latitude': np.linspace(0, 20)}}) - >>> ds.geo.center - (5.0, 10.0) - >>> ds.geo.plot() - # plots data on a map + .. code-block:: ipython + + In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10), + ...: "latitude": np.linspace(0, 20)}}) + In [2]: ds.geo.center + Out[2]: (5.0, 10.0) + In [3]: ds.geo.plot() # plots data on a map """ def decorator(accessor): @@ -279,33 +282,21 @@ def decorator(accessor): return decorator -@doc( - _register_accessor, - klass="DataFrame", - others="register_series_accessor, register_index_accessor", -) +@doc(_register_accessor, klass="DataFrame") def register_dataframe_accessor(name): from pandas import DataFrame return _register_accessor(name, DataFrame) -@doc( - _register_accessor, - klass="Series", - others="register_dataframe_accessor, register_index_accessor", -) +@doc(_register_accessor, klass="Series") def register_series_accessor(name): from pandas import Series return _register_accessor(name, Series) -@doc( - _register_accessor, - klass="Index", - others="register_dataframe_accessor, register_series_accessor", -) +@doc(_register_accessor, klass="Index") def register_index_accessor(name): from pandas import Index diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 448f84d58d7a0..f6380808d5ac2 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -27,10 +27,9 @@ def is_multi_agg_with_relabel(**kwargs) -> bool: Examples -------- - >>> is_multi_agg_with_relabel(a='max') + >>> is_multi_agg_with_relabel(a="max") False - >>> is_multi_agg_with_relabel(a_max=('a', 'max'), - ... 
a_min=('a', 'min')) + >>> is_multi_agg_with_relabel(a_max=("a", "max"), a_min=("a", "min")) True >>> is_multi_agg_with_relabel() False @@ -61,8 +60,8 @@ def normalize_keyword_aggregation(kwargs: dict) -> Tuple[dict, List[str], List[i Examples -------- - >>> normalize_keyword_aggregation({'output': ('input', 'sum')}) - ({'input': ['sum']}, ('output',), [('input', 'sum')]) + >>> normalize_keyword_aggregation({"output": ("input", "sum")}) + (defaultdict(, {'input': ['sum']}), ('output',), array([0])) """ # Normalize the aggregation functions as Mapping[column, List[func]], # process normally, then fixup the names. diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 02a979aea6c6b..62a3808d36ba2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -11,6 +11,7 @@ from pandas._libs import Timestamp, algos, hashtable as htable, lib from pandas._libs.tslib import iNaT +from pandas._typing import AnyArrayLike from pandas.util._decorators import doc from pandas.core.dtypes.cast import ( @@ -45,10 +46,14 @@ is_unsigned_integer_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndex, + ABCIndexClass, + ABCSeries, +) from pandas.core.dtypes.missing import isna, na_value_for_dtype -import pandas.core.common as com from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices @@ -313,8 +318,8 @@ def unique(values): See Also -------- - Index.unique - Series.unique + Index.unique : Return unique values from an Index. + Series.unique : Return unique values of Series object. Examples -------- @@ -384,7 +389,7 @@ def unique(values): unique1d = unique -def isin(comps, values) -> np.ndarray: +def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: """ Compute the isin boolean array. @@ -409,15 +414,14 @@ def isin(comps, values) -> np.ndarray: f"to isin(), you passed a [{type(values).__name__}]" ) - if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): + if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) + comps = extract_array(comps, extract_numpy=True) if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - return comps._values.isin(values) - - comps = com.values_from_object(comps) + return comps.isin(values) # type: ignore comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) @@ -552,7 +556,7 @@ def factorize( >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) >>> codes - array([0, 0, 1, 2, 0]) + array([0, 0, 1, 2, 0]...) >>> uniques array(['b', 'a', 'c'], dtype=object) @@ -561,7 +565,7 @@ def factorize( >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) >>> codes - array([1, 1, 0, 2, 1]) + array([1, 1, 0, 2, 1]...) >>> uniques array(['a', 'b', 'c'], dtype=object) @@ -571,7 +575,7 @@ def factorize( >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) >>> codes - array([ 0, -1, 1, 2, 0]) + array([ 0, -1, 1, 2, 0]...) >>> uniques array(['b', 'a', 'c'], dtype=object) @@ -582,7 +586,7 @@ def factorize( >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) >>> codes, uniques = pd.factorize(cat) >>> codes - array([0, 0, 1]) + array([0, 0, 1]...) 
>>> uniques [a, c] Categories (3, object): [a, b, c] @@ -596,7 +600,7 @@ def factorize( >>> cat = pd.Series(['a', 'a', 'c']) >>> codes, uniques = pd.factorize(cat) >>> codes - array([0, 0, 1]) + array([0, 0, 1]...) >>> uniques Index(['a', 'c'], dtype='object') """ @@ -686,8 +690,8 @@ def value_counts( values = Series(values) try: ii = cut(values, bins, include_lowest=True) - except TypeError: - raise TypeError("bins argument only works with numeric data.") + except TypeError as err: + raise TypeError("bins argument only works with numeric data.") from err # count, remove nulls (from the index), and but the bins result = ii.value_counts(dropna=dropna) @@ -696,7 +700,7 @@ def value_counts( result = result.sort_index() # if we are dropna and we have NO values - if dropna and (result.values == 0).all(): + if dropna and (result._values == 0).all(): result = result.iloc[0:0] # normalizing is by len of all (regardless of dropna) @@ -709,7 +713,7 @@ def value_counts( # handle Categorical and sparse, result = Series(values)._values.value_counts(dropna=dropna) result.name = name - counts = result.values + counts = result._values else: keys, counts = _value_counts_arraylike(values, dropna) @@ -819,7 +823,7 @@ def mode(values, dropna: bool = True) -> "Series": # categorical is a fast-path if is_categorical_dtype(values): if isinstance(values, Series): - return Series(values.values.mode(dropna=dropna), name=values.name) + return Series(values._values.mode(dropna=dropna), name=values.name) return values.mode(dropna=dropna) if dropna and needs_i8_conversion(values.dtype): @@ -1515,7 +1519,7 @@ def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None) See Also -------- - numpy.take + numpy.take : Take elements from an array along an axis. Examples -------- @@ -1641,7 +1645,7 @@ def take_nd( if arr.flags.f_contiguous and axis == arr.ndim - 1: # minor tweak that can make an order-of-magnitude difference # for dataframes initialized directly from 2-d ndarrays - # (s.t. df.values is c-contiguous and df._data.blocks[0] is its + # (s.t. 
df.values is c-contiguous and df._mgr.blocks[0] is its # f-contiguous transpose) out = np.empty(out_shape, dtype=dtype, order="F") else: @@ -2021,9 +2025,7 @@ def sort_mixed(values): ) codes = ensure_platform_int(np.asarray(codes)) - from pandas import Index - - if not assume_unique and not Index(values).is_unique: + if not assume_unique and not len(unique(values)) == len(values): raise ValueError("values should be unique if codes is not None") if sorter is None: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 70e0a129c055f..a013434491589 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -165,10 +165,9 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): with np.errstate(all="ignore"): - results = self.obj._data.apply("apply", func=self.f) - return self.obj._constructor( - data=results, index=self.index, columns=self.columns, copy=False - ) + results = self.obj._mgr.apply("apply", func=self.f) + # _constructor will retain self.index and self.columns + return self.obj._constructor(data=results) # broadcasting if self.result_type == "broadcast": @@ -179,7 +178,7 @@ def get_result(self): return self.apply_empty_result() # raw - elif self.raw and not self.obj._is_mixed_type: + elif self.raw: return self.apply_raw() return self.apply_standard() diff --git a/pandas/core/array_algos/__init__.py b/pandas/core/array_algos/__init__.py new file mode 100644 index 0000000000000..a7655a013c6cf --- /dev/null +++ b/pandas/core/array_algos/__init__.py @@ -0,0 +1,9 @@ +""" +core.array_algos is for algorithms that operate on ndarray and ExtensionArray. +These should: + +- Assume that any Index, Series, or DataFrame objects have already been unwrapped. +- Assume that any list arguments have already been cast to ndarray/EA. +- Not depend on Index, Series, or DataFrame, nor import any of these. +- May dispatch to ExtensionArray methods, but should not import from core.arrays. +""" diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py new file mode 100644 index 0000000000000..1b9ed014f27b7 --- /dev/null +++ b/pandas/core/array_algos/masked_reductions.py @@ -0,0 +1,102 @@ +""" +masked_reductions.py is for reduction algorithms using a mask-based approach +for missing values. +""" + +from typing import Callable + +import numpy as np + +from pandas._libs import missing as libmissing +from pandas.compat.numpy import _np_version_under1p17 + +from pandas.core.nanops import check_below_min_count + + +def _sumprod( + func: Callable, + values: np.ndarray, + mask: np.ndarray, + skipna: bool = True, + min_count: int = 0, +): + """ + Sum or product for 1D masked array. + + Parameters + ---------- + func : np.sum or np.prod + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. 
+ """ + if not skipna: + if mask.any() or check_below_min_count(values.shape, None, min_count): + return libmissing.NA + else: + return func(values) + else: + if check_below_min_count(values.shape, mask, min_count): + return libmissing.NA + + if _np_version_under1p17: + return func(values[~mask]) + else: + return func(values, where=~mask) + + +def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count + ) + + +def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count + ) + + +def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True): + """ + Reduction for 1D masked array. + + Parameters + ---------- + func : np.min or np.max + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + """ + if not skipna: + if mask.any() or not values.size: + # min/max with empty array raise in numpy, pandas returns NA + return libmissing.NA + else: + return func(values) + else: + subset = values[~mask] + if subset.size: + return func(subset) + else: + # min/max with empty array raise in numpy, pandas returns NA + return libmissing.NA + + +def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True): + return _minmax(np.min, values=values, mask=mask, skipna=skipna) + + +def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True): + return _minmax(np.max, values=values, mask=mask, skipna=skipna) diff --git a/pandas/core/array_algos/transforms.py b/pandas/core/array_algos/transforms.py new file mode 100644 index 0000000000000..f775b6d733d9c --- /dev/null +++ b/pandas/core/array_algos/transforms.py @@ -0,0 +1,33 @@ +""" +transforms.py is for shape-preserving functions. 
+""" + +import numpy as np + +from pandas.core.dtypes.common import ensure_platform_int + + +def shift(values: np.ndarray, periods: int, axis: int, fill_value) -> np.ndarray: + new_values = values + + # make sure array sent to np.roll is c_contiguous + f_ordered = values.flags.f_contiguous + if f_ordered: + new_values = new_values.T + axis = new_values.ndim - axis - 1 + + if np.prod(new_values.shape): + new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis) + + axis_indexer = [slice(None)] * values.ndim + if periods > 0: + axis_indexer[axis] = slice(None, periods) + else: + axis_indexer[axis] = slice(periods, None) + new_values[tuple(axis_indexer)] = fill_value + + # restore original order + if f_ordered: + new_values = new_values.T + + return new_values diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index bf3469924a700..1d538824e6d82 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -2,7 +2,6 @@ ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin, - try_cast_to_ea, ) from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.categorical import Categorical @@ -19,7 +18,6 @@ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", - "try_cast_to_ea", "BooleanArray", "Categorical", "DatetimeArray", diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 20e4cf70eddcf..471bfa736d4b9 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -121,8 +121,8 @@ def _generate_range_overflow_safe( # we cannot salvage the operation by recursing, so raise try: addend = np.uint64(periods) * np.uint64(np.abs(stride)) - except FloatingPointError: - raise OutOfBoundsDatetime(msg) + except FloatingPointError as err: + raise OutOfBoundsDatetime(msg) from err if np.abs(addend) <= i64max: # relatively easy case without casting concerns diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b5da6d4c11616..7447d593a7ff0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -19,9 +19,10 @@ from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.cast import maybe_cast_to_extension_array from pandas.core.dtypes.common import is_array_like, is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -32,29 +33,6 @@ _extension_array_shared_docs: Dict[str, str] = dict() -def try_cast_to_ea(cls_or_instance, obj, dtype=None): - """ - Call to `_from_sequence` that returns the object unchanged on Exception. - - Parameters - ---------- - cls_or_instance : ExtensionArray subclass or instance - obj : arraylike - Values to pass to cls._from_sequence - dtype : ExtensionDtype, optional - - Returns - ------- - ExtensionArray or obj - """ - try: - result = cls_or_instance._from_sequence(obj, dtype=dtype) - except Exception: - # We can't predict what downstream EA constructors may raise - result = obj - return result - - class ExtensionArray: """ Abstract base class for custom 1-D array types. 
@@ -93,7 +71,6 @@ class ExtensionArray: _from_factorized _from_sequence _from_sequence_of_strings - _ndarray_values _reduce _values_for_argsort _values_for_factorize @@ -356,7 +333,9 @@ def __iter__(self): for i in range(len(self)): yield self[i] - def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: """ Convert to a NumPy ndarray. @@ -407,6 +386,13 @@ def shape(self) -> Tuple[int, ...]: """ return (len(self),) + @property + def size(self) -> int: + """ + The number of elements in the array. + """ + return np.prod(self.shape) + @property def ndim(self) -> int: """ @@ -583,7 +569,7 @@ def dropna(self): """ return self[~self.isna()] - def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: + def shift(self, periods: int = 1, fill_value: object = None) -> "ExtensionArray": """ Shift values by desired number. @@ -720,7 +706,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArray]: + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: """ Encode the extension array as an enumerated type. @@ -825,7 +811,7 @@ def repeat(self, repeats, axis=None): def take( self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None - ) -> ABCExtensionArray: + ) -> "ExtensionArray": """ Take elements from an array. @@ -914,7 +900,7 @@ def take(self, indices, allow_fill=False, fill_value=None): # pandas.api.extensions.take raise AbstractMethodError(self) - def copy(self) -> ABCExtensionArray: + def copy(self) -> "ExtensionArray": """ Return a copy of the array. @@ -924,7 +910,7 @@ def copy(self) -> ABCExtensionArray: """ raise AbstractMethodError(self) - def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: + def view(self, dtype=None) -> ArrayLike: """ Return a view on the array. @@ -935,8 +921,8 @@ def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: Returns ------- - ExtensionArray - A view of the :class:`ExtensionArray`. + ExtensionArray or np.ndarray + A view on the :class:`ExtensionArray`'s data. """ # NB: # - This must return a *new* object referencing the same data, not self. @@ -994,7 +980,7 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: # Reshaping # ------------------------------------------------------------------------ - def ravel(self, order="C") -> ABCExtensionArray: + def ravel(self, order="C") -> "ExtensionArray": """ Return a flattened view on this array. @@ -1015,8 +1001,8 @@ def ravel(self, order="C") -> ABCExtensionArray: @classmethod def _concat_same_type( - cls, to_concat: Sequence[ABCExtensionArray] - ) -> ABCExtensionArray: + cls, to_concat: Sequence["ExtensionArray"] + ) -> "ExtensionArray": """ Concatenate multiple array. @@ -1037,22 +1023,6 @@ def _concat_same_type( # of objects _can_hold_na = True - @property - def _ndarray_values(self) -> np.ndarray: - """ - Internal pandas method for lossy conversion to a NumPy ndarray. - - This method is not part of the pandas interface. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - - Returns - ------- - array : ndarray - """ - return np.array(self) - def _reduce(self, name, skipna=True, **kwargs): """ Return a scalar result of performing the reduction operation. 
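# Illustrative sketch of the new ``ExtensionArray.size`` property added above:
# like ``numpy.ndarray.size`` it is the product of the shape, which for these
# 1-D arrays equals ``len(arr)``. Requires a pandas build containing this change.
import pandas as pd

arr = pd.array([1, 2, pd.NA], dtype="Int64")
arr.shape  # (3,)
arr.size   # 3
len(arr)   # 3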
@@ -1191,7 +1161,7 @@ def _create_method(cls, op, coerce_to_dtype=True): -------- Given an ExtensionArray subclass called MyExtensionArray, use - >>> __add__ = cls._create_method(operator.add) + __add__ = cls._create_method(operator.add) in the class definition of MyExtensionArray to create the operator for addition, that will be based on the operator implementation @@ -1222,7 +1192,7 @@ def _maybe_convert(arr): # https://github.com/pandas-dev/pandas/issues/22850 # We catch all regular exceptions here, and fall back # to an ndarray. - res = try_cast_to_ea(self, arr) + res = maybe_cast_to_extension_array(type(self), arr) if not isinstance(res, type(self)): # exception raised in _from_sequence; ensure we have ndarray res = np.asarray(arr) @@ -1236,7 +1206,7 @@ def _maybe_convert(arr): return _maybe_convert(res) - op_name = ops._get_op_name(op, True) + op_name = f"__{op.__name__}__" return set_function_name(_binop, op_name, cls) @classmethod diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index d93b5fbc83312..685a9ec48228f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -24,9 +24,10 @@ ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops +from pandas.core.array_algos import masked_reductions from pandas.core.indexers import check_array_indexer from .masked import BaseMaskedArray @@ -270,18 +271,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): raise TypeError( "values should be boolean numpy array. Use " - "the 'array' function instead" + "the 'pd.array' function instead" ) - if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): - raise TypeError( - "mask should be boolean numpy array. 
Use " - "the 'array' function instead" - ) - if not values.ndim == 1: - raise ValueError("values must be a 1D array") - if not mask.ndim == 1: - raise ValueError("mask must be a 1D array") - self._dtype = BooleanDtype() super().__init__(values, mask, copy=copy) @@ -519,7 +510,7 @@ def any(self, skipna: bool = True, **kwargs): if skipna: return result else: - if result or len(self) == 0: + if result or len(self) == 0 or not self._mask.any(): return result else: return self.dtype.na_value @@ -586,7 +577,7 @@ def all(self, skipna: bool = True, **kwargs): if skipna: return result else: - if not result or len(self) == 0: + if not result or len(self) == 0 or not self._mask.any(): return result else: return self.dtype.na_value @@ -695,6 +686,10 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask + if name in {"sum", "prod", "min", "max"}: + op = getattr(masked_reductions, name) + return op(data, mask, skipna=skipna, **kwargs) + # coerce to a nan-aware float if needed if self._hasna: data = self.to_numpy("float64", na_value=np.nan) @@ -705,15 +700,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if np.isnan(result): return libmissing.NA - # if we have numeric op that would result in an int, coerce to int if possible - if name in ["sum", "prod"] and notna(result): - int_result = np.int64(result) - if int_result == result: - result = int_result - - elif name in ["min", "max"] and notna(result): - result = np.bool_(result) - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a5048e3aae899..c9b8db28e0cf6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -15,10 +15,15 @@ Substitution, cache_readonly, deprecate_kwarg, + doc, ) from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike +from pandas.core.dtypes.cast import ( + coerce_indexer_dtype, + maybe_cast_to_extension_array, + maybe_infer_to_datetimelike, +) from pandas.core.dtypes.common import ( ensure_int64, ensure_object, @@ -46,11 +51,7 @@ from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d -from pandas.core.arrays.base import ( - ExtensionArray, - _extension_array_shared_docs, - try_cast_to_ea, -) +from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array @@ -103,7 +104,10 @@ def func(self, other): mask = (self._codes == -1) | (other_codes == -1) if mask.any(): # In other series, the leads to False, so do that here too - ret[mask] = False + if opname == "__ne__": + ret[(self._codes == -1) & (other_codes == -1)] = True + else: + ret[mask] = False return ret if is_scalar(other): @@ -238,8 +242,6 @@ class Categorical(ExtensionArray, PandasObject): dtype : CategoricalDtype An instance of ``CategoricalDtype`` to use for this categorical. - .. versionadded:: 0.21.0 - Attributes ---------- categories : Index @@ -253,8 +255,6 @@ class Categorical(ExtensionArray, PandasObject): The instance of ``CategoricalDtype`` storing the ``categories`` and ``ordered``. - .. 
versionadded:: 0.21.0 - Methods ------- from_codes @@ -350,7 +350,7 @@ def __init__( if dtype.categories is None: try: codes, categories = factorize(values, sort=True) - except TypeError: + except TypeError as err: codes, categories = factorize(values, sort=False) if dtype.ordered: # raise, as we don't have a sortable data structure and so @@ -359,13 +359,13 @@ def __init__( "'values' is not ordered, please " "explicitly specify the categories order " "by passing in a categories argument." - ) - except ValueError: + ) from err + except ValueError as err: # FIXME raise NotImplementedError( "> 1 ndim Categorical are not supported at this time" - ) + ) from err # we're inferring from values dtype = CategoricalDtype(categories, dtype.ordered) @@ -374,7 +374,7 @@ def __init__( old_codes = ( values._values.codes if isinstance(values, ABCSeries) else values.codes ) - codes = _recode_for_categories( + codes = recode_for_categories( old_codes, values.dtype.categories, dtype.categories ) @@ -412,12 +412,12 @@ def categories(self): See Also -------- - rename_categories - reorder_categories - add_categories - remove_categories - remove_unused_categories - set_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. """ return self.dtype.categories @@ -447,10 +447,6 @@ def dtype(self) -> CategoricalDtype: """ return self._dtype - @property - def _ndarray_values(self) -> np.ndarray: - return self.codes - @property def _constructor(self) -> Type["Categorical"]: return Categorical @@ -572,13 +568,13 @@ def _from_inferred_categories( if known_categories: # Recode from observation order to dtype.categories order. categories = dtype.categories - codes = _recode_for_categories(inferred_codes, cats, categories) + codes = recode_for_categories(inferred_codes, cats, categories) elif not cats.is_monotonic_increasing: # Sort categories and recode for unknown categories. unsorted = cats.copy() categories = cats.sort_values() - codes = _recode_for_categories(inferred_codes, unsorted, categories) + codes = recode_for_categories(inferred_codes, unsorted, categories) dtype = CategoricalDtype(categories, ordered=False) else: dtype = CategoricalDtype(cats, ordered=False) @@ -727,7 +723,7 @@ def _set_dtype(self, dtype: CategoricalDtype) -> "Categorical": We don't do any validation here. It's assumed that the dtype is a (valid) instance of `CategoricalDtype`. """ - codes = _recode_for_categories(self.codes, self.categories, dtype.categories) + codes = recode_for_categories(self.codes, self.categories, dtype.categories) return type(self)(codes, dtype=dtype, fastpath=True) def set_ordered(self, value, inplace=False): @@ -830,11 +826,11 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal See Also -------- - rename_categories - reorder_categories - add_categories - remove_categories - remove_unused_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. 
""" inplace = validate_bool_kwarg(inplace, "inplace") if ordered is None: @@ -849,7 +845,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal # remove all _codes which are larger and set to -1/NaN cat._codes[cat._codes >= len(new_dtype.categories)] = -1 else: - codes = _recode_for_categories( + codes = recode_for_categories( cat.codes, cat.categories, new_dtype.categories ) cat._codes = codes @@ -876,8 +872,6 @@ def rename_categories(self, new_categories, inplace=False): are passed through and extra categories in the mapping are ignored. - .. versionadded:: 0.21.0. - * callable : a callable that is called on all items in the old categories and whose return values comprise the new categories. @@ -901,11 +895,11 @@ def rename_categories(self, new_categories, inplace=False): See Also -------- - reorder_categories - add_categories - remove_categories - remove_unused_categories - set_categories + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. Examples -------- @@ -969,11 +963,11 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): See Also -------- - rename_categories - add_categories - remove_categories - remove_unused_categories - set_categories + rename_categories : Rename categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. """ inplace = validate_bool_kwarg(inplace, "inplace") if set(self.dtype.categories) != set(new_categories): @@ -1009,11 +1003,11 @@ def add_categories(self, new_categories, inplace=False): See Also -------- - rename_categories - reorder_categories - remove_categories - remove_unused_categories - set_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. """ inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(new_categories): @@ -1058,11 +1052,11 @@ def remove_categories(self, removals, inplace=False): See Also -------- - rename_categories - reorder_categories - add_categories - remove_unused_categories - set_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. """ inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(removals): @@ -1100,11 +1094,11 @@ def remove_unused_categories(self, inplace=False): See Also -------- - rename_categories - reorder_categories - add_categories - remove_categories - set_categories + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + set_categories : Set the categories to the specified ones. 
""" inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() @@ -1306,7 +1300,6 @@ def __setstate__(self, state): if not isinstance(state, dict): raise Exception("invalid pickle state") - # compat with pre 0.21.0 CategoricalDtype change if "_dtype" not in state: state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) @@ -1314,7 +1307,7 @@ def __setstate__(self, state): setattr(self, k, v) @property - def T(self): + def T(self) -> "Categorical": """ Return transposed numpy array. """ @@ -1349,8 +1342,7 @@ def memory_usage(self, deep=False): """ return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) - @Substitution(klass="Categorical") - @Appender(_shared_docs["searchsorted"]) + @doc(_shared_docs["searchsorted"], klass="Categorical") def searchsorted(self, value, side="left", sorter=None): # searchsorted is very performance sensitive. By converting codes # to same dtype as self.codes, we get much faster performance. @@ -1406,12 +1398,6 @@ def notna(self): notnull = notna - def put(self, *args, **kwargs): - """ - Replace specific elements in the Categorical with given values. - """ - raise NotImplementedError(("'put' is not yet implemented for Categorical")) - def dropna(self): """ Return the Categorical without null values. @@ -1491,7 +1477,7 @@ def check_for_ordered(self, op): ) def _values_for_argsort(self): - return self._codes.copy() + return self._codes def argsort(self, ascending=True, kind="quicksort", **kwargs): """ @@ -1605,19 +1591,19 @@ def sort_values(self, inplace=False, ascending=True, na_position="last"): >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5]) >>> c - [NaN, 2.0, 2.0, NaN, 5.0] + [NaN, 2, 2, NaN, 5] Categories (2, int64): [2, 5] >>> c.sort_values() - [2.0, 2.0, 5.0, NaN, NaN] + [2, 2, 5, NaN, NaN] Categories (2, int64): [2, 5] >>> c.sort_values(ascending=False) - [5.0, 2.0, 2.0, NaN, NaN] + [5, 2, 2, NaN, NaN] Categories (2, int64): [2, 5] >>> c.sort_values(na_position='first') - [NaN, NaN, 2.0, 2.0, 5.0] + [NaN, NaN, 2, 2, 5] Categories (2, int64): [2, 5] >>> c.sort_values(ascending=False, na_position='first') - [NaN, NaN, 5.0, 2.0, 2.0] + [NaN, NaN, 5, 2, 2] Categories (2, int64): [2, 5] """ inplace = validate_bool_kwarg(inplace, "inplace") @@ -1678,6 +1664,12 @@ def to_dense(self): ------- dense : array """ + warn( + "Categorical.to_dense is deprecated and will be removed in " + "a future version. 
Use np.asarray(cat) instead.", + FutureWarning, + stacklevel=2, + ) return np.asarray(self) def fillna(self, value=None, method=None, limit=None): @@ -1725,7 +1717,8 @@ def fillna(self, value=None, method=None, limit=None): # pad / bfill if method is not None: - values = self.to_dense().reshape(-1, len(self)) + # TODO: dispatch when self.categories is EA-dtype + values = np.asarray(self).reshape(-1, len(self)) values = interpolate_2d(values, method, 0, None, value).astype( self.categories.dtype )[0] @@ -1735,12 +1728,17 @@ def fillna(self, value=None, method=None, limit=None): # If value is a dict or a Series (a dict value has already # been converted to a Series) - if isinstance(value, ABCSeries): - if not value[~value.isin(self.categories)].isna().all(): + if isinstance(value, (np.ndarray, Categorical, ABCSeries)): + # We get ndarray or Categorical if called via Series.fillna, + # where it will unwrap another aligned Series before getting here + + mask = ~algorithms.isin(value, self.categories) + if not isna(value[mask]).all(): raise ValueError("fill value must be in categories") values_codes = _get_codes_for_values(value, self.categories) indexer = np.where(codes == -1) + codes = codes.copy() codes[indexer] = values_codes[indexer] # If value is not a dict or Series it should be a scalar @@ -1830,7 +1828,7 @@ def take(self, indexer, allow_fill: bool = False, fill_value=None): >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') [a, a, a] - Categories (3, object): [a, b] + Categories (2, object): [a, b] Specifying a fill value that's not in ``self.categories`` will raise a ``TypeError``. @@ -2029,7 +2027,7 @@ def __setitem__(self, key, value): "without identical categories" ) if not self.categories.equals(value.categories): - new_codes = _recode_for_categories( + new_codes = recode_for_categories( value.codes, value.categories, self.categories ) value = Categorical.from_codes(new_codes, dtype=self.dtype) @@ -2226,33 +2224,32 @@ def unique(self): ------- unique values : ``Categorical`` + See Also + -------- + pandas.unique + CategoricalIndex.unique + Series.unique + Examples -------- An unordered Categorical will return categories in the order of appearance. - >>> pd.Categorical(list('baabc')) + >>> pd.Categorical(list("baabc")).unique() [b, a, c] Categories (3, object): [b, a, c] - >>> pd.Categorical(list('baabc'), categories=list('abc')) + >>> pd.Categorical(list("baabc"), categories=list("abc")).unique() [b, a, c] Categories (3, object): [b, a, c] An ordered Categorical preserves the category ordering. - >>> pd.Categorical(list('baabc'), - ... categories=list('abc'), - ... ordered=True) + >>> pd.Categorical( + ... list("baabc"), categories=list("abc"), ordered=True + ... 
).unique() [b, a, c] Categories (3, object): [a < b < c] - - See Also - -------- - unique - CategoricalIndex.unique - Series.unique - """ # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) @@ -2294,7 +2291,7 @@ def equals(self, other): # fastpath to avoid re-coding other_codes = other._codes else: - other_codes = _recode_for_categories( + other_codes = recode_for_categories( other.codes, other.categories, self.categories ) return np.array_equal(self._codes, other_codes) @@ -2433,7 +2430,7 @@ def replace(self, to_replace, value, inplace: bool = False): -------- >>> s = pd.Categorical([1, 2, 1, 3]) >>> s.replace(1, 3) - [3, 3, 2, 3] + [3, 2, 3, 3] Categories (2, int64): [2, 3] """ inplace = validate_bool_kwarg(inplace, "inplace") @@ -2450,6 +2447,8 @@ def replace(self, to_replace, value, inplace: bool = False): # other cases, like if both to_replace and value are list-like or if # to_replace is a dict, are handled separately in NDFrame for replace_value, new_value in replace_dict.items(): + if new_value == replace_value: + continue if replace_value in cat.categories: if isna(new_value): cat.remove_categories(replace_value, inplace=True) @@ -2501,16 +2500,100 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): Examples -------- + >>> s = pd.Series(list("abbccc")).astype("category") + >>> s + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): [a, b, c] + >>> s.cat.categories - >>> s.cat.categories = list('abc') - >>> s.cat.rename_categories(list('cab')) - >>> s.cat.reorder_categories(list('cab')) - >>> s.cat.add_categories(['d','e']) - >>> s.cat.remove_categories(['d']) - >>> s.cat.remove_unused_categories() - >>> s.cat.set_categories(list('abcde')) + Index(['a', 'b', 'c'], dtype='object') + + >>> s.cat.rename_categories(list("cba")) + 0 c + 1 b + 2 b + 3 a + 4 a + 5 a + dtype: category + Categories (3, object): [c, b, a] + + >>> s.cat.reorder_categories(list("cba")) + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): [c, b, a] + + >>> s.cat.add_categories(["d", "e"]) + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (5, object): [a, b, c, d, e] + + >>> s.cat.remove_categories(["a", "c"]) + 0 NaN + 1 b + 2 b + 3 NaN + 4 NaN + 5 NaN + dtype: category + Categories (1, object): [b] + + >>> s1 = s.cat.add_categories(["d", "e"]) + >>> s1.cat.remove_unused_categories() + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): [a, b, c] + + >>> s.cat.set_categories(list("abcde")) + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (5, object): [a, b, c, d, e] + >>> s.cat.as_ordered() + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): [a < b < c] + >>> s.cat.as_unordered() + 0 a + 1 b + 2 b + 3 c + 4 c + 5 c + dtype: category + Categories (3, object): [a, b, c] """ def __init__(self, data): @@ -2558,22 +2641,17 @@ def _get_codes_for_values(values, categories): """ dtype_equal = is_dtype_equal(values.dtype, categories.dtype) - if dtype_equal: - # To prevent erroneous dtype coercion in _get_data_algo, retrieve - # the underlying numpy array. gh-22702 - values = getattr(values, "_ndarray_values", values) - categories = getattr(categories, "_ndarray_values", categories) - elif is_extension_array_dtype(categories.dtype) and is_object_dtype(values): + if is_extension_array_dtype(categories.dtype) and is_object_dtype(values): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. 
# Categorical(array[Period, Period], categories=PeriodIndex(...)) cls = categories.dtype.construct_array_type() - values = try_cast_to_ea(cls, values) + values = maybe_cast_to_extension_array(cls, values) if not isinstance(values, cls): # exception raised in _from_sequence values = ensure_object(values) categories = ensure_object(categories) - else: + elif not dtype_equal: values = ensure_object(values) categories = ensure_object(categories) @@ -2584,7 +2662,7 @@ def _get_codes_for_values(values, categories): return coerce_indexer_dtype(t.lookup(vals), cats) -def _recode_for_categories(codes: np.ndarray, old_categories, new_categories): +def recode_for_categories(codes: np.ndarray, old_categories, new_categories): """ Convert a set of codes for to a new set of categories @@ -2602,8 +2680,8 @@ def _recode_for_categories(codes: np.ndarray, old_categories, new_categories): >>> old_cat = pd.Index(['b', 'a', 'c']) >>> new_cat = pd.Index(['a', 'b']) >>> codes = np.array([0, 1, 1, 2]) - >>> _recode_for_categories(codes, old_cat, new_cat) - array([ 1, 0, 0, -1]) + >>> recode_for_categories(codes, old_cat, new_cat) + array([ 1, 0, 0, -1], dtype=int8) """ if len(old_categories) == 0: # All null anyway, so just retain the nulls diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f637e16caa4c6..30a34282889f8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -40,8 +40,10 @@ from pandas.core import missing, nanops, ops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts +from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com +from pandas.core.construction import array, extract_array from pandas.core.indexers import check_array_indexer from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison, make_invalid_op @@ -129,7 +131,7 @@ class AttributesMixin: _data: np.ndarray @classmethod - def _simple_new(cls, values, **kwargs): + def _simple_new(cls, values: np.ndarray, **kwargs): raise AbstractMethodError(cls) @property @@ -179,7 +181,7 @@ def _unbox_scalar(self, value: Union[Period, Timestamp, Timedelta, NaTType]) -> Examples -------- - >>> self._unbox_scalar(Timedelta('10s')) # DOCTEST: +SKIP + >>> self._unbox_scalar(Timedelta("10s")) # doctest: +SKIP 10000000000 """ raise AbstractMethodError(self) @@ -200,7 +202,7 @@ def _check_compatible_with( ---------- other setitem : bool, default False - For __setitem__ we may have stricter compatiblity resrictions than + For __setitem__ we may have stricter compatibility resrictions than for comparisons. Raises @@ -394,6 +396,34 @@ def floor(self, freq, ambiguous="raise", nonexistent="raise"): def ceil(self, freq, ambiguous="raise", nonexistent="raise"): return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) + def _with_freq(self, freq): + """ + Helper to set our freq in-place, returning self to allow method chaining. + + Parameters + ---------- + freq : DateOffset, None, or "infer" + + Returns + ------- + self + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, DateOffset): + # Always valid. In the TimedeltaArray case, we assume this + # is a Tick offset. 
+ pass + else: + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = frequencies.to_offset(self.inferred_freq) + + self._freq = freq + return self + class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray): """ @@ -454,10 +484,6 @@ def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self._data.view("i8") - @property - def _ndarray_values(self): - return self._data - # ---------------------------------------------------------------- # Rendering Methods @@ -524,10 +550,7 @@ def __getitem__(self, key): key = np.asarray(key, dtype=bool) key = check_array_indexer(self, key) - if key.all(): - key = slice(0, None, None) - else: - key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + key = lib.maybe_booleans_to_slice(key.view(np.uint8)) elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice): # see https://github.com/pandas-dev/pandas/issues/31299, need to allow # this for now (would otherwise raise in check_array_indexer) @@ -535,7 +558,7 @@ def __getitem__(self, key): else: key = check_array_indexer(self, key) - is_period = is_period_dtype(self) + is_period = is_period_dtype(self.dtype) if is_period: freq = self.freq else: @@ -551,11 +574,8 @@ def __getitem__(self, key): freq = self.freq result = getitem(key) - if result.ndim > 1: - # To support MPL which performs slicing with 2 dim - # even though it only has 1 dim by definition - return result - + if lib.is_scalar(result): + return self._box_func(result) return self._simple_new(result, dtype=self.dtype, freq=freq) def __setitem__( @@ -623,7 +643,7 @@ def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_object_dtype(dtype): - return self._box_values(self.asi8) + return self._box_values(self.asi8.ravel()).reshape(self.shape) elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): return self._format_native_types() elif is_integer_dtype(dtype): @@ -744,6 +764,38 @@ def _from_factorized(cls, values, original): def _values_for_argsort(self): return self._data + @Appender(ExtensionArray.shift.__doc__) + def shift(self, periods=1, fill_value=None, axis=0): + if not self.size or periods == 0: + return self.copy() + + if is_valid_nat_for_dtype(fill_value, self.dtype): + fill_value = NaT + elif not isinstance(fill_value, self._recognized_scalars): + # only warn if we're not going to raise + if self._scalar_type is Period and lib.is_integer(fill_value): + # kludge for #31971 since Period(integer) tries to cast to str + new_fill = Period._from_ordinal(fill_value, freq=self.freq) + else: + new_fill = self._scalar_type(fill_value) + + # stacklevel here is chosen to be correct when called from + # DataFrame.shift or Series.shift + warnings.warn( + f"Passing {type(fill_value)} to shift is deprecated and " + "will raise in a future version, pass " + f"{self._scalar_type.__name__} instead.", + FutureWarning, + stacklevel=9, + ) + fill_value = new_fill + + fill_value = self._unbox_scalar(fill_value) + + new_values = shift(self._data, periods, axis, fill_value) + + return type(self)._simple_new(new_values, dtype=self.dtype) + # ------------------------------------------------------------------ # Additional array methods # These are not part of the EA API, but we implement them because @@ -788,14 +840,14 @@ def searchsorted(self, value, side="left", sorter=None): elif isinstance(value, self._recognized_scalars): value = self._scalar_type(value) - elif isinstance(value, np.ndarray): + elif 
is_list_like(value) and not isinstance(value, type(self)): + value = array(value) + if not type(self)._is_recognized_dtype(value): raise TypeError( "searchsorted requires compatible dtype or scalar, " f"not {type(value).__name__}" ) - value = type(self)(value) - self._check_compatible_with(value) if not (isinstance(value, (self._scalar_type, type(self))) or (value is NaT)): raise TypeError(f"Unexpected type for 'value': {type(value)}") @@ -847,7 +899,7 @@ def value_counts(self, dropna=False): index = Index( cls(result.index.view("i8"), dtype=self.dtype), name=result.index.name ) - return Series(result.values, index=index, name=result.name) + return Series(result._values, index=index, name=result.name) def map(self, mapper): # TODO(GH-23179): Add ExtensionArray.map @@ -1099,56 +1151,46 @@ def _sub_period(self, other): def _add_offset(self, offset): raise AbstractMethodError(self) - def _add_delta(self, other): + def _add_timedeltalike_scalar(self, other): """ - Add a timedelta-like, Tick or TimedeltaIndex-like object - to self, yielding an int64 numpy array - - Parameters - ---------- - delta : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} + Add a delta of a timedeltalike Returns ------- - result : ndarray[int64] - - Notes - ----- - The result's name is set outside of _add_delta by the calling - method (__add__ or __sub__), if necessary (i.e. for Indexes). - """ - if isinstance(other, (Tick, timedelta, np.timedelta64)): - new_values = self._add_timedeltalike_scalar(other) - elif is_timedelta64_dtype(other): - # ndarray[timedelta64] or TimedeltaArray/index - new_values = self._add_delta_tdi(other) - - return new_values - - def _add_timedeltalike_scalar(self, other): - """ - Add a delta of a timedeltalike - return the i8 result view + Same type as self """ if isna(other): # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds new_values = np.empty(self.shape, dtype="i8") new_values[:] = iNaT - return new_values + return type(self)(new_values, dtype=self.dtype) inc = delta_to_nanoseconds(other) new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan).view( "i8" ) new_values = self._maybe_mask_results(new_values) - return new_values.view("i8") - def _add_delta_tdi(self, other): + new_freq = None + if isinstance(self.freq, Tick) or is_period_dtype(self.dtype): + # adding a scalar preserves freq + new_freq = self.freq + + if new_freq is not None: + # fastpath that doesnt require inference + return type(self)(new_values, dtype=self.dtype, freq=new_freq) + return type(self)(new_values, dtype=self.dtype)._with_freq("infer") + + def _add_timedelta_arraylike(self, other): """ Add a delta of a TimedeltaIndex - return the i8 result view + + Returns + ------- + Same type as self """ + # overridden by PeriodArray + if len(self) != len(other): raise ValueError("cannot add indices of unequal length") @@ -1166,7 +1208,8 @@ def _add_delta_tdi(self, other): if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = iNaT - return new_values.view("i8") + + return type(self)(new_values, dtype=self.dtype)._with_freq("infer") def _add_nat(self): """ @@ -1256,19 +1299,13 @@ def _addsub_object_array(self, other: np.ndarray, op): PerformanceWarning, ) - # For EA self.astype('O') returns a numpy array, not an Index - left = self.astype("O") + # Caller is responsible for broadcasting if necessary + assert self.shape == other.shape, (self.shape, other.shape) - res_values = op(left, np.array(other)) - kwargs = {} - if not 
is_period_dtype(self): - kwargs["freq"] = "infer" - try: - res = type(self)._from_sequence(res_values, **kwargs) - except ValueError: - # e.g. we've passed a Timestamp to TimedeltaArray - res = res_values - return res + res_values = op(self.astype("O"), np.array(other)) + result = array(res_values.ravel()) + result = extract_array(result, extract_numpy=True).reshape(self.shape) + return result def _time_shift(self, periods, freq=None): """ @@ -1314,7 +1351,7 @@ def __add__(self, other): if other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): - result = self._add_delta(other) + result = self._add_timedeltalike_scalar(other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(other) @@ -1330,7 +1367,7 @@ def __add__(self, other): # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] - result = self._add_delta(other) + result = self._add_timedelta_arraylike(other) elif is_object_dtype(other): # e.g. Array/Index of DateOffset objects result = self._addsub_object_array(other, operator.add) @@ -1366,7 +1403,7 @@ def __sub__(self, other): if other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): - result = self._add_delta(-other) + result = self._add_timedeltalike_scalar(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) @@ -1385,7 +1422,7 @@ def __sub__(self, other): # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] - result = self._add_delta(-other) + result = self._add_timedelta_arraylike(-other) elif is_object_dtype(other): # e.g. Array/Index of DateOffset objects result = self._addsub_object_array(other, operator.sub) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index a75536e46e60d..2ccc0ff2fa31d 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -18,11 +18,13 @@ timezones, tzconversion, ) +import pandas._libs.tslibs.frequencies as libfrequencies from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import ( _INT64_DTYPE, - _NS_DTYPE, + DT64NS_DTYPE, + is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype, is_datetime64_dtype, @@ -65,7 +67,7 @@ def tz_to_dtype(tz): np.dtype or Datetime64TZDType """ if tz is None: - return _NS_DTYPE + return DT64NS_DTYPE else: return DatetimeTZDtype(tz=tz) @@ -181,7 +183,7 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps "microsecond", "nanosecond", ] - _other_ops = ["date", "time", "timetz"] + _other_ops = ["date", "time", "timetz", "isocalendar"] _datetimelike_ops = _field_ops + _object_ops + _bool_ops + _other_ops _datetimelike_methods = [ "to_period", @@ -208,7 +210,7 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps _dtype: Union[np.dtype, DatetimeTZDtype] _freq = None - def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): + def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): if isinstance(values, (ABCSeries, ABCIndexClass)): values = values._values @@ -245,9 +247,9 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): # for compat with datetime/timedelta/period shared methods, # we can sometimes get here with int64 values. 
These represent # nanosecond UTC (or tz-naive) unix timestamps - values = values.view(_NS_DTYPE) + values = values.view(DT64NS_DTYPE) - if values.dtype != _NS_DTYPE: + if values.dtype != DT64NS_DTYPE: raise ValueError( "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'. " f"Got {values.dtype} instead." @@ -281,11 +283,11 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): type(self)._validate_frequency(self, freq) @classmethod - def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE): + def _simple_new(cls, values, freq=None, dtype=DT64NS_DTYPE): assert isinstance(values, np.ndarray) - if values.dtype != _NS_DTYPE: + if values.dtype != DT64NS_DTYPE: assert values.dtype == "i8" - values = values.view(_NS_DTYPE) + values = values.view(DT64NS_DTYPE) result = object.__new__(cls) result._data = values @@ -587,6 +589,8 @@ def astype(self, dtype, copy=True): if getattr(self.dtype, "tz", None) is None: return self.tz_localize(new_tz) result = self.tz_convert(new_tz) + if copy: + result = result.copy() if new_tz is None: # Do we want .astype('datetime64[ns]') to be an ndarray. # The astype in Block._astype expects this to return an @@ -610,8 +614,8 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): fmt = _get_format_datetime64_from_values(self, date_format) return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep - ) + self.asi8.ravel(), tz=self.tz, format=fmt, na_rep=na_rep + ).reshape(self.shape) # ----------------------------------------------------------------- # Comparison Methods @@ -694,7 +698,7 @@ def _add_offset(self, offset): # GH#30336 _from_sequence won't be able to infer self.tz return type(self)._from_sequence(result).tz_localize(self.tz) - return type(self)._from_sequence(result, freq="infer") + return type(self)._from_sequence(result)._with_freq("infer") def _sub_datetimelike_scalar(self, other): # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]] @@ -715,23 +719,6 @@ def _sub_datetimelike_scalar(self, other): result = self._maybe_mask_results(result) return result.view("timedelta64[ns]") - def _add_delta(self, delta): - """ - Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new DatetimeArray - - Parameters - ---------- - other : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} - - Returns - ------- - result : DatetimeArray - """ - new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, tz=self.tz, freq="infer") - # ----------------------------------------------------------------- # Timezone Conversion and Localization Methods @@ -936,9 +923,10 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): ... '2018-10-28 02:36:00', ... 
'2018-10-28 03:46:00'])) >>> s.dt.tz_localize('CET', ambiguous=np.array([True, True, False])) - 0 2015-03-29 03:00:00+02:00 - 1 2015-03-29 03:30:00+02:00 - dtype: datetime64[ns, Europe/Warsaw] + 0 2018-10-28 01:20:00+02:00 + 1 2018-10-28 02:36:00+02:00 + 2 2018-10-28 03:46:00+01:00 + dtype: datetime64[ns, CET] If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` @@ -949,15 +937,17 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') 0 2015-03-29 03:00:00+02:00 1 2015-03-29 03:30:00+02:00 - dtype: datetime64[ns, 'Europe/Warsaw'] + dtype: datetime64[ns, Europe/Warsaw] + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_backward') 0 2015-03-29 01:59:59.999999999+01:00 1 2015-03-29 03:30:00+02:00 - dtype: datetime64[ns, 'Europe/Warsaw'] + dtype: datetime64[ns, Europe/Warsaw] + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H')) 0 2015-03-29 03:30:00+02:00 1 2015-03-29 03:30:00+02:00 - dtype: datetime64[ns, 'Europe/Warsaw'] + dtype: datetime64[ns, Europe/Warsaw] """ nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( @@ -981,14 +971,14 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): new_dates = conversion.tz_localize_to_utc( self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent ) - new_dates = new_dates.view(_NS_DTYPE) + new_dates = new_dates.view(DT64NS_DTYPE) dtype = tz_to_dtype(tz) return self._simple_new(new_dates, dtype=dtype, freq=self.freq) # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timestamp methods - def to_pydatetime(self): + def to_pydatetime(self) -> np.ndarray: """ Return Datetime Array/Index as object ndarray of datetime.datetime objects. @@ -1045,7 +1035,7 @@ def normalize(self): new_values[not_null] = new_values[not_null] - adjustment else: new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) - return type(self)._from_sequence(new_values, freq="infer").tz_localize(self.tz) + return type(self)(new_values)._with_freq("infer").tz_localize(self.tz) def to_period(self, freq=None): """ @@ -1108,7 +1098,14 @@ def to_period(self, freq=None): "You must pass a freq argument as current index has none." ) - freq = get_period_alias(freq) + res = get_period_alias(freq) + + # https://github.com/pandas-dev/pandas/issues/33358 + if res is None: + base, stride = libfrequencies._base_and_stride(freq) + res = f"{stride}{base}" + + freq = res return PeriodArray._from_datetime64(self._data, freq, tz=self.tz) @@ -1245,11 +1242,71 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") + @property + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day calculated according to + the ISO 8601 standard. + + .. 
versionadded:: 1.1.0 + + Returns + ------- + DataFrame + with columns year, week and day + + See Also + -------- + Timestamp.isocalendar + datetime.date.isocalendar + + Examples + -------- + >>> idx = pd.date_range(start='2019-12-29', freq='D', periods=4) + >>> idx.isocalendar + year week day + 0 2019 52 7 + 1 2020 1 1 + 2 2020 1 2 + 3 2020 1 3 + >>> idx.isocalendar.week + 0 52 + 1 1 + 2 1 + 3 1 + Name: week, dtype: UInt32 + """ + from pandas import DataFrame + + sarray = fields.build_isocalendar_sarray(self.asi8) + iso_calendar_df = DataFrame( + sarray, columns=["year", "week", "day"], dtype="UInt32" + ) + if self._hasnans: + iso_calendar_df.iloc[self._isnan] = None + return iso_calendar_df + year = _field_accessor( "year", "Y", """ The year of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="Y") + ... ) + >>> datetime_series + 0 2000-12-31 + 1 2001-12-31 + 2 2002-12-31 + dtype: datetime64[ns] + >>> datetime_series.dt.year + 0 2000 + 1 2001 + 2 2002 + dtype: int64 """, ) month = _field_accessor( @@ -1257,6 +1314,22 @@ def date(self): "M", """ The month as January=1, December=12. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="M") + ... ) + >>> datetime_series + 0 2000-01-31 + 1 2000-02-29 + 2 2000-03-31 + dtype: datetime64[ns] + >>> datetime_series.dt.month + 0 1 + 1 2 + 2 3 + dtype: int64 """, ) day = _field_accessor( @@ -1264,6 +1337,22 @@ def date(self): "D", """ The day of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="D") + ... ) + >>> datetime_series + 0 2000-01-01 + 1 2000-01-02 + 2 2000-01-03 + dtype: datetime64[ns] + >>> datetime_series.dt.day + 0 1 + 1 2 + 2 3 + dtype: int64 """, ) hour = _field_accessor( @@ -1271,6 +1360,22 @@ def date(self): "h", """ The hours of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="h") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 01:00:00 + 2 2000-01-01 02:00:00 + dtype: datetime64[ns] + >>> datetime_series.dt.hour + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) minute = _field_accessor( @@ -1278,6 +1383,22 @@ def date(self): "m", """ The minutes of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="T") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:01:00 + 2 2000-01-01 00:02:00 + dtype: datetime64[ns] + >>> datetime_series.dt.minute + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) second = _field_accessor( @@ -1285,6 +1406,22 @@ def date(self): "s", """ The seconds of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="s") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + dtype: datetime64[ns] + >>> datetime_series.dt.second + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) microsecond = _field_accessor( @@ -1292,6 +1429,22 @@ def date(self): "us", """ The microseconds of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="us") + ... 
) + >>> datetime_series + 0 2000-01-01 00:00:00.000000 + 1 2000-01-01 00:00:00.000001 + 2 2000-01-01 00:00:00.000002 + dtype: datetime64[ns] + >>> datetime_series.dt.microsecond + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) nanosecond = _field_accessor( @@ -1299,6 +1452,22 @@ def date(self): "ns", """ The nanoseconds of the datetime. + + Examples + -------- + >>> datetime_series = pd.Series( + ... pd.date_range("2000-01-01", periods=3, freq="ns") + ... ) + >>> datetime_series + 0 2000-01-01 00:00:00.000000000 + 1 2000-01-01 00:00:00.000000001 + 2 2000-01-01 00:00:00.000000002 + dtype: datetime64[ns] + >>> datetime_series.dt.nanosecond + 0 0 + 1 1 + 2 2 + dtype: int64 """, ) weekofyear = _field_accessor( @@ -1618,9 +1787,9 @@ def date(self): DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], dtype='datetime64[ns]', freq='A-DEC') >>> idx.is_leap_year - array([ True, False, False], dtype=bool) + array([ True, False, False]) - >>> dates = pd.Series(idx) + >>> dates_series = pd.Series(idx) >>> dates_series 0 2012-12-31 1 2013-12-31 @@ -1762,7 +1931,7 @@ def sequence_to_dt64ns( elif is_datetime64_dtype(data): # tz-naive DatetimeArray or ndarray[datetime64] data = getattr(data, "_data", data) - if data.dtype != _NS_DTYPE: + if data.dtype != DT64NS_DTYPE: data = conversion.ensure_datetime64ns(data) if tz is not None: @@ -1771,9 +1940,9 @@ def sequence_to_dt64ns( data = conversion.tz_localize_to_utc( data.view("i8"), tz, ambiguous=ambiguous ) - data = data.view(_NS_DTYPE) + data = data.view(DT64NS_DTYPE) - assert data.dtype == _NS_DTYPE, data.dtype + assert data.dtype == DT64NS_DTYPE, data.dtype result = data else: @@ -1784,7 +1953,7 @@ def sequence_to_dt64ns( if data.dtype != _INT64_DTYPE: data = data.astype(np.int64, copy=False) - result = data.view(_NS_DTYPE) + result = data.view(DT64NS_DTYPE) if copy: # TODO: should this be deepcopy? @@ -1901,32 +2070,36 @@ def maybe_convert_dtype(data, copy): ------ TypeError : PeriodDType data is passed """ - if is_float_dtype(data): + if not hasattr(data, "dtype"): + # e.g. collections.deque + return data, copy + + if is_float_dtype(data.dtype): # Note: we must cast to datetime64[ns] here in order to treat these # as wall-times instead of UTC timestamps. - data = data.astype(_NS_DTYPE) + data = data.astype(DT64NS_DTYPE) copy = False # TODO: deprecate this behavior to instead treat symmetrically # with integer dtypes. See discussion in GH#23675 - elif is_timedelta64_dtype(data): + elif is_timedelta64_dtype(data.dtype) or is_bool_dtype(data.dtype): # GH#29794 enforcing deprecation introduced in GH#23539 raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]") - elif is_period_dtype(data): + elif is_period_dtype(data.dtype): # Note: without explicitly raising here, PeriodIndex # test_setops.test_join_does_not_recur fails raise TypeError( "Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead" ) - elif is_categorical_dtype(data): + elif is_categorical_dtype(data.dtype): # GH#18664 preserve tz in going DTI->Categorical->DTI # TODO: cases where we need to do another pass through this func, # e.g. 
the categories are timedelta64s data = data.categories.take(data.codes, fill_value=NaT)._values copy = False - elif is_extension_array_dtype(data) and not is_datetime64tz_dtype(data): + elif is_extension_array_dtype(data.dtype) and not is_datetime64tz_dtype(data.dtype): # Includes categorical # TODO: We have no tests for these data = np.array(data, dtype=np.object_) @@ -2001,7 +2174,7 @@ def _validate_dt64_dtype(dtype): ) raise ValueError(msg) - if (isinstance(dtype, np.dtype) and dtype != _NS_DTYPE) or not isinstance( + if (isinstance(dtype, np.dtype) and dtype != DT64NS_DTYPE) or not isinstance( dtype, (np.dtype, DatetimeTZDtype) ): raise ValueError( @@ -2080,11 +2253,11 @@ def _infer_tz_from_endpoints(start, end, tz): """ try: inferred_tz = timezones.infer_tzinfo(start, end) - except AssertionError: + except AssertionError as err: # infer_tzinfo raises AssertionError if passed mismatched timezones raise TypeError( "Start and end cannot both be tz-aware with different timezones" - ) + ) from err inferred_tz = timezones.maybe_get_tz(inferred_tz) tz = timezones.maybe_get_tz(tz) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f1e0882def13b..37620edfd9a95 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, + is_datetime64_dtype, is_float, is_float_dtype, is_integer, @@ -26,7 +27,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops -import pandas.core.common as com +from pandas.core.array_algos import masked_reductions from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer @@ -154,7 +155,7 @@ def safe_cast(values, dtype, copy: bool): """ try: return values.astype(dtype, casting="safe", copy=copy) - except TypeError: + except TypeError as err: casted = values.astype(dtype, copy=copy) if (casted == values).all(): @@ -162,7 +163,7 @@ def safe_cast(values, dtype, copy: bool): raise TypeError( f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}" - ) + ) from err def coerce_to_array( @@ -199,8 +200,8 @@ def coerce_to_array( if not issubclass(type(dtype), _IntegerDtype): try: dtype = _dtypes[str(np.dtype(dtype))] - except KeyError: - raise ValueError(f"invalid dtype specified {dtype}") + except KeyError as err: + raise ValueError(f"invalid dtype specified {dtype}") from err if isinstance(values, IntegerArray): values, mask = values._data, values._mask @@ -341,15 +342,10 @@ def dtype(self) -> _IntegerDtype: return _dtypes[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)): + if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): raise TypeError( "values should be integer numpy array. Use " - "the 'integer_array' function instead" - ) - if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): - raise TypeError( - "mask should be boolean numpy array. 
Use " - "the 'integer_array' function instead" + "the 'pd.array' function instead" ) super().__init__(values, mask, copy=copy) @@ -469,24 +465,14 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if is_float_dtype(dtype): # In astype, we consider dtype=float to also mean na_value=np.nan kwargs = dict(na_value=np.nan) + elif is_datetime64_dtype(dtype): + kwargs = dict(na_value=np.datetime64("NaT")) else: kwargs = {} data = self.to_numpy(dtype=dtype, **kwargs) return astype_nansafe(data, dtype, copy=False) - @property - def _ndarray_values(self) -> np.ndarray: - """ - Internal pandas method for lossy conversion to a NumPy ndarray. - - This method is not part of the pandas interface. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. - """ - return self._data - def _values_for_factorize(self) -> Tuple[np.ndarray, float]: # TODO: https://github.com/pandas-dev/pandas/issues/30037 # use masked algorithms, rather than object-dtype / np.nan. @@ -507,7 +493,8 @@ def _values_for_argsort(self) -> np.ndarray: ExtensionArray.argsort """ data = self._data.copy() - data[self._mask] = data.min() - 1 + if self._mask.any(): + data[self._mask] = data.min() - 1 return data @classmethod @@ -569,6 +556,10 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask + if name in {"sum", "prod", "min", "max"}: + op = getattr(masked_reductions, name) + return op(data, mask, skipna=skipna, **kwargs) + # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) if self._hasna: @@ -584,12 +575,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in ["any", "all"]: pass - # if we have a preservable numeric op, - # provide coercion back to an integer type if possible - elif name in ["sum", "min", "max", "prod"]: - # GH#31409 more performant than casting-then-checking - result = com.cast_scalar_indexer(result) - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index f5167f470b056..220b70ff71b28 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -29,7 +29,6 @@ ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass, - ABCInterval, ABCIntervalIndex, ABCPeriodIndex, ABCSeries, @@ -153,7 +152,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): def __new__(cls, data, closed=None, dtype=None, copy=False, verify_integrity=True): if isinstance(data, ABCSeries) and is_interval_dtype(data): - data = data.values + data = data._values if isinstance(data, (cls, ABCIntervalIndex)): left = data.left @@ -448,12 +447,12 @@ def from_tuples(cls, data, closed="right", copy=False, dtype=None): try: # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] 
lhs, rhs = d - except ValueError: + except ValueError as err: msg = f"{name}.from_tuples requires tuples of length 2, got {d}" - raise ValueError(msg) - except TypeError: + raise ValueError(msg) from err + except TypeError as err: msg = f"{name}.from_tuples received an invalid item, {d}" - raise TypeError(msg) + raise TypeError(msg) from err left.append(lhs) right.append(rhs) @@ -529,7 +528,7 @@ def __setitem__(self, key, value): value_left, value_right = value, value # scalar interval - elif is_interval_dtype(value) or isinstance(value, ABCInterval): + elif is_interval_dtype(value) or isinstance(value, Interval): self._check_closed_matches(value, name="value") value_left, value_right = value.left, value.right @@ -538,24 +537,24 @@ def __setitem__(self, key, value): try: array = IntervalArray(value) value_left, value_right = array.left, array.right - except TypeError: + except TypeError as err: # wrong type: not interval or NA msg = f"'value' should be an interval type, got {type(value)} instead." - raise TypeError(msg) + raise TypeError(msg) from err + + if needs_float_conversion: + raise ValueError("Cannot set float NaN to integer-backed IntervalArray") key = check_array_indexer(self, key) + # Need to ensure that left and right are updated atomically, so we're # forced to copy, update the copy, and swap in the new values. left = self.left.copy(deep=True) - if needs_float_conversion: - left = left.astype("float") - left.values[key] = value_left + left._values[key] = value_left self._left = left right = self.right.copy(deep=True) - if needs_float_conversion: - right = right.astype("float") - right.values[key] = value_right + right._values[key] = value_right self._right = right def __eq__(self, other): @@ -642,7 +641,7 @@ def fillna(self, value=None, method=None, limit=None): if limit is not None: raise TypeError("limit is not supported for IntervalArray.") - if not isinstance(value, ABCInterval): + if not isinstance(value, Interval): msg = ( "'IntervalArray.fillna' only supports filling with a " f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead." @@ -688,20 +687,20 @@ def astype(self, dtype, copy=True): try: new_left = self.left.astype(dtype.subtype) new_right = self.right.astype(dtype.subtype) - except TypeError: + except TypeError as err: msg = ( f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" ) - raise TypeError(msg) + raise TypeError(msg) from err return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self)) # TODO: This try/except will be repeated. try: return np.asarray(self).astype(dtype, copy=copy) - except (TypeError, ValueError): + except (TypeError, ValueError) as err: msg = f"Cannot cast {type(self).__name__} to dtype {dtype}" - raise TypeError(msg) + raise TypeError(msg) from err @classmethod def _concat_same_type(cls, to_concat): @@ -1020,13 +1019,13 @@ def length(self): """ try: return self.right - self.left - except TypeError: + except TypeError as err: # length not defined for some types, e.g. string msg = ( "IntervalArray contains Intervals without defined length, " "e.g. 
Intervals with string endpoints" ) - raise TypeError(msg) + raise TypeError(msg) from err @property def mid(self): @@ -1100,11 +1099,11 @@ def __arrow_array__(self, type=None): try: subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) - except TypeError: + except TypeError as err: raise TypeError( f"Conversion to arrow with subtype '{self.dtype.subtype}' " "is not supported" - ) + ) from err interval_type = ArrowIntervalType(subtype, self.closed) storage_array = pyarrow.StructArray.from_arrays( [ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 47892b55b3ce8..fc5b307bd5754 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -30,6 +30,17 @@ class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): _internal_fill_value: Scalar def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + # values is supposed to already be validated in the subclass + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be boolean numpy array. Use " + "the 'pd.array' function instead" + ) + if not values.ndim == 1: + raise ValueError("values must be a 1D array") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D array") + if copy: values = values.copy() mask = mask.copy() @@ -94,7 +105,7 @@ def to_numpy( >>> a = pd.array([True, False, pd.NA], dtype="boolean") >>> a.to_numpy() - array([True, False, NA], dtype=object) + array([True, False, <NA>], dtype=object) When no missing values are present, an equivalent dtype can be used. @@ -110,7 +121,7 @@ def to_numpy( >>> a = pd.array([True, False, pd.NA], dtype="boolean") >>> a - [True, False, NA] + [True, False, <NA>] Length: 3, dtype: boolean >>> a.to_numpy(dtype="bool") @@ -244,11 +255,11 @@ def value_counts(self, dropna: bool = True) -> "Series": # TODO(extension) # if we have allow Index to hold an ExtensionArray # this is easier - index = value_counts.index.values.astype(object) + index = value_counts.index._values.astype(object) # if we want nans, count the mask if dropna: - counts = value_counts.values + counts = value_counts._values else: counts = np.empty(len(value_counts) + 1, dtype="int64") counts[:-1] = value_counts diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 0e64967ce93a6..3058e1d6073f3 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -6,7 +6,7 @@ from pandas._libs import lib from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender +from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype @@ -435,7 +435,10 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): # ------------------------------------------------------------------------ # Additional Methods - def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: result = np.asarray(self._ndarray, dtype=dtype) if (copy or na_value is not lib.no_default) and result is self._ndarray: @@ -446,7 +449,7 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): return result - @Appender(ExtensionArray.searchsorted.__doc__) + @doc(ExtensionArray.searchsorted) def searchsorted(self, value, side="left", sorter=None): return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) diff --git a/pandas/core/arrays/period.py 
b/pandas/core/arrays/period.py index 8141e2c78a7e2..99d9d69d66ec2 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -23,7 +23,7 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - _TD_DTYPE, + TD64NS_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype, @@ -31,13 +31,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype -from pandas.core.dtypes.generic import ( - ABCIndexClass, - ABCPeriod, - ABCPeriodArray, - ABCPeriodIndex, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodIndex, ABCSeries from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos @@ -48,7 +42,7 @@ from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick -def _field_accessor(name, alias, docstring=None): +def _field_accessor(name: str, alias: int, docstring=None): def f(self): base, mult = libfrequencies.get_freq_code(self.freq) result = get_period_field_arr(alias, self.asi8, base) @@ -170,9 +164,10 @@ def __init__(self, values, freq=None, dtype=None, copy=False): self._dtype = PeriodDtype(freq) @classmethod - def _simple_new(cls, values: np.ndarray, freq=None, **kwargs): + def _simple_new(cls, values: np.ndarray, freq=None, **kwargs) -> "PeriodArray": # alias for PeriodArray.__init__ - assert isinstance(values, np.ndarray) and values.dtype == "i8" + assertion_msg = "Should be numpy array of type i8" + assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg return cls(values, freq=freq, **kwargs) @classmethod @@ -181,7 +176,7 @@ def _from_sequence( scalars: Sequence[Optional[Period]], dtype: Optional[PeriodDtype] = None, copy: bool = False, - ) -> ABCPeriodArray: + ) -> "PeriodArray": if dtype: freq = dtype.freq else: @@ -191,6 +186,7 @@ def _from_sequence( validate_dtype_freq(scalars.dtype, freq) if copy: scalars = scalars.copy() + assert isinstance(scalars, PeriodArray) # for mypy return scalars periods = np.asarray(scalars, dtype=object) @@ -202,11 +198,13 @@ def _from_sequence( return cls(ordinals, freq=freq) @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + def _from_sequence_of_strings( + cls, strings, dtype=None, copy=False + ) -> "PeriodArray": return cls._from_sequence(strings, dtype, copy) @classmethod - def _from_datetime64(cls, data, freq, tz=None): + def _from_datetime64(cls, data, freq, tz=None) -> "PeriodArray": """ Construct a PeriodArray from a datetime64 array @@ -270,19 +268,24 @@ def _check_compatible_with(self, other, setitem: bool = False): # Data / Attributes @cache_readonly - def dtype(self): + def dtype(self) -> PeriodDtype: return self._dtype # error: Read-only property cannot override read-write property [misc] @property # type: ignore - def freq(self): + def freq(self) -> DateOffset: """ Return the frequency object for this PeriodArray. """ return self.dtype.freq def __array__(self, dtype=None) -> np.ndarray: - # overriding DatetimelikeArray + if dtype == "i8": + return self.asi8 + elif dtype == bool: + return ~self._isnan + + # This will raise TypeErorr for non-object dtypes return np.array(list(self), dtype=object) def __arrow_array__(self, type=None): @@ -397,7 +400,7 @@ def __arrow_array__(self, type=None): daysinmonth = days_in_month @property - def is_leap_year(self): + def is_leap_year(self) -> np.ndarray: """ Logical indicating if the date belongs to a leap year. 
""" @@ -451,13 +454,7 @@ def to_timestamp(self, freq=None, how="start"): new_data = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) - return DatetimeArray._from_sequence(new_data, freq="infer") - - # -------------------------------------------------------------------- - # Array-like / EA-Interface Methods - - def _values_for_argsort(self): - return self._data + return DatetimeArray(new_data)._with_freq("infer") # -------------------------------------------------------------------- @@ -490,7 +487,7 @@ def _time_shift(self, periods, freq=None): def _box_func(self): return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) - def asfreq(self, freq=None, how="E"): + def asfreq(self, freq=None, how="E") -> "PeriodArray": """ Convert the Period Array/Index to the specified frequency `freq`. @@ -552,7 +549,7 @@ def asfreq(self, freq=None, how="E"): # ------------------------------------------------------------------ # Rendering Methods - def _formatter(self, boxed=False): + def _formatter(self, boxed: bool = False): if boxed: return str return "'{}'".format @@ -579,7 +576,7 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): # ------------------------------------------------------------------ - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): # We handle Period[T] -> Period[U] # Our parent handles everything else. dtype = pandas_dtype(dtype) @@ -652,10 +649,11 @@ def _add_timedeltalike_scalar(self, other): Returns ------- - result : ndarray[int64] + PeriodArray """ - assert isinstance(self.freq, Tick) # checked by calling function - assert isinstance(other, (timedelta, np.timedelta64, Tick)) + if not isinstance(self.freq, Tick): + # We cannot add timedelta-like to non-tick PeriodArray + raise raise_on_incompatible(self, other) if notna(other): # special handling for np.timedelta64("NaT"), avoid calling @@ -665,10 +663,9 @@ def _add_timedeltalike_scalar(self, other): # Note: when calling parent class's _add_timedeltalike_scalar, # it will call delta_to_nanoseconds(delta). Because delta here # is an integer, delta_to_nanoseconds will return it unchanged. 
- ordinals = super()._add_timedeltalike_scalar(other) - return ordinals + return super()._add_timedeltalike_scalar(other) - def _add_delta_tdi(self, other): + def _add_timedelta_arraylike(self, other): """ Parameters ---------- @@ -678,7 +675,9 @@ def _add_delta_tdi(self, other): ------- result : ndarray[int64] """ - assert isinstance(self.freq, Tick) # checked by calling function + if not isinstance(self.freq, Tick): + # We cannot add timedelta-like to non-tick PeriodArray + raise raise_on_incompatible(self, other) if not np.all(isna(other)): delta = self._check_timedeltalike_freq_compat(other) @@ -686,28 +685,8 @@ def _add_delta_tdi(self, other): # all-NaT TimedeltaIndex is equivalent to a single scalar td64 NaT return self + np.timedelta64("NaT") - return self._addsub_int_array(delta, operator.add).asi8 - - def _add_delta(self, other): - """ - Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new PeriodArray - - Parameters - ---------- - other : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} - - Returns - ------- - result : PeriodArray - """ - if not isinstance(self.freq, Tick): - # We cannot add timedelta-like to non-tick PeriodArray - raise raise_on_incompatible(self, other) - - new_ordinals = super()._add_delta(other) - return type(self)(new_ordinals, freq=self.freq) + ordinals = self._addsub_int_array(delta, operator.add).asi8 + return type(self)(ordinals, dtype=self.dtype) def _check_timedeltalike_freq_compat(self, other): """ @@ -739,10 +718,10 @@ def _check_timedeltalike_freq_compat(self, other): elif isinstance(other, np.ndarray): # numpy timedelta64 array; all entries must be compatible assert other.dtype.kind == "m" - if other.dtype != _TD_DTYPE: + if other.dtype != TD64NS_DTYPE: # i.e. non-nano unit # TODO: disallow unit-less timedelta64 - other = other.astype(_TD_DTYPE) + other = other.astype(TD64NS_DTYPE) nanos = other.view("i8") else: # TimedeltaArray/Index @@ -839,6 +818,7 @@ def period_array( Integers that look like years are handled >>> period_array([2000, 2001, 2002], freq='D') + ['2000-01-01', '2001-01-01', '2002-01-01'] Length: 3, dtype: period[D] @@ -851,11 +831,11 @@ def period_array( """ if is_datetime64_dtype(data): return PeriodArray._from_datetime64(data, freq) - if isinstance(data, (ABCPeriodIndex, ABCSeries, PeriodArray)): + if is_period_dtype(data): return PeriodArray(data, freq) # other iterable of some kind - if not isinstance(data, (np.ndarray, list, tuple)): + if not isinstance(data, (np.ndarray, list, tuple, ABCSeries)): data = list(data) data = np.asarray(data) @@ -960,8 +940,8 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): if end is not None: end = Period(end, freq) - is_start_per = isinstance(start, ABCPeriod) - is_end_per = isinstance(end, ABCPeriod) + is_start_per = isinstance(start, Period) + is_end_per = isinstance(end, Period) if is_start_per and is_end_per and start.freq != end.freq: raise ValueError("start and end must have same freq") diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 92c05f44d677c..8a30d2b954b55 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -67,24 +67,25 @@ def from_coo(cls, A, dense_index=False): Examples -------- >>> from scipy import sparse - >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), - shape=(3, 4)) + + >>> A = sparse.coo_matrix( + ... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4) + ... 
) >>> A <3x4 sparse matrix of type '<class 'numpy.float64'>' - with 3 stored elements in COOrdinate format> + with 3 stored elements in COOrdinate format> + >>> A.todense() - matrix([[ 0., 0., 1., 2.], - [ 3., 0., 0., 0.], - [ 0., 0., 0., 0.]]) + matrix([[0., 0., 1., 2.], + [3., 0., 0., 0.], + [0., 0., 0., 0.]]) + >>> ss = pd.Series.sparse.from_coo(A) >>> ss - 0 2 1 - 3 2 - 1 0 3 - dtype: float64 - BlockIndex - Block locations: array([0], dtype=int32) - Block lengths: array([3], dtype=int32) + 0 2 1.0 + 3 2.0 + 1 0 3.0 + dtype: Sparse[float64, nan] """ from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series from pandas import Series @@ -119,24 +120,49 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): Examples -------- >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) - >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) + >>> s.index = pd.MultiIndex.from_tuples( + ... [ + ... (1, 2, "a", 0), + ... (1, 2, "a", 1), + ... (1, 1, "b", 0), + ... (1, 1, "b", 1), + ... (2, 1, "b", 0), + ... (2, 1, "b", 1) + ... ], + ... names=["A", "B", "C", "D"], + ... ) + >>> s + A B C D + 1 2 a 0 3.0 + 1 NaN + 1 b 0 1.0 + 1 3.0 + 2 1 b 0 NaN + 1 NaN + dtype: float64 + >>> ss = s.astype("Sparse") - >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'], - ... column_levels=['C', 'D'], - ... sort_labels=True) + >>> ss + A B C D + 1 2 a 0 3.0 + 1 NaN + 1 b 0 1.0 + 1 3.0 + 2 1 b 0 NaN + 1 NaN + dtype: Sparse[float64, nan] + + >>> A, rows, columns = ss.sparse.to_coo( + ... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True + ... ) >>> A <3x4 sparse matrix of type '<class 'numpy.float64'>' - with 3 stored elements in COOrdinate format> + with 3 stored elements in COOrdinate format> >>> A.todense() - matrix([[ 0., 0., 1., 3.], - [ 3., 0., 0., 0.], - [ 0., 0., 0., 0.]]) + matrix([[0., 0., 1., 3.], + [3., 0., 0., 0.], + [0., 0., 0., 0.]]) + >>> rows [(1, 1), (1, 2), (2, 1)] >>> columns @@ -228,14 +254,29 @@ def from_spmatrix(cls, data, index=None, columns=None): 2 0.0 0.0 1.0 """ from pandas import DataFrame + from pandas._libs.sparse import IntIndex data = data.tocsc() index, columns = cls._prep_index(data, index, columns) - sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])] - data = dict(enumerate(sparrays)) - result = DataFrame(data, index=index) - result.columns = columns - return result + n_rows, n_columns = data.shape + # We need to make sure indices are sorted, as we create + # IntIndex with no input validation (i.e. check_integrity=False ). + # Indices may already be sorted in scipy in which case this adds + # a small overhead. 
+ data.sort_indices() + indices = data.indices + indptr = data.indptr + array_data = data.data + dtype = SparseDtype(array_data.dtype, 0) + arrays = [] + for i in range(n_columns): + sl = slice(indptr[i], indptr[i + 1]) + idx = IntIndex(n_rows, indices[sl], check_integrity=False) + arr = SparseArray._simple_new(array_data[sl], idx, dtype) + arrays.append(arr) + return DataFrame._from_arrays( + arrays, columns=columns, index=index, verify_integrity=False + ) def to_dense(self): """ @@ -314,12 +355,17 @@ def density(self) -> float: @staticmethod def _prep_index(data, index, columns): import pandas.core.indexes.base as ibase + from pandas.core.indexes.api import ensure_index N, K = data.shape if index is None: index = ibase.default_index(N) + else: + index = ensure_index(index) if columns is None: columns = ibase.default_index(K) + else: + columns = ensure_index(columns) if len(columns) != K: raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}") diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 542cfd334b810..a98875ace09aa 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -27,6 +27,7 @@ is_array_like, is_bool_dtype, is_datetime64_any_dtype, + is_datetime64tz_dtype, is_dtype_equal, is_integer, is_object_dtype, @@ -42,7 +43,7 @@ from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.construction import sanitize_array +from pandas.core.construction import extract_array, sanitize_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import interpolate_2d import pandas.core.ops as ops @@ -231,7 +232,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` is not a ``SparseDtype`` and `data` is a ``SparseArray``. - kind : {'integer', 'block'}, default 'integer' + kind : {'int', 'block'}, default 'int' The type of storage for sparse locations. * 'block': Stores a `block` and `block_length` for each @@ -255,9 +256,18 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): Methods ------- None + + Examples + -------- + >>> from pandas.arrays import SparseArray + >>> arr = SparseArray([0, 0, 1, 2]) + >>> arr + [0, 0, 1, 2] + Fill: 0 + IntIndex + Indices: array([2, 3], dtype=int32) """ - _pandas_ftype = "sparse" _subtyp = "sparse_array" # register ABCSparseArray _deprecations = PandasObject._deprecations | frozenset(["get_values"]) _sparse_index: SparseIndex @@ -302,7 +312,7 @@ def __init__( dtype = dtype.subtype if index is not None and not is_scalar(data): - raise Exception("must only pass scalars with an index ") + raise Exception("must only pass scalars with an index") if is_scalar(data): if index is not None: @@ -357,6 +367,19 @@ def __init__( sparse_index = data._sparse_index sparse_values = np.asarray(data.sp_values, dtype=dtype) elif sparse_index is None: + data = extract_array(data, extract_numpy=True) + if not isinstance(data, np.ndarray): + # EA + if is_datetime64tz_dtype(data.dtype): + warnings.warn( + f"Creating SparseArray from {data.dtype} data " + "loses timezone information. 
Cast to object before " + "sparse to retain timezone information.", + UserWarning, + stacklevel=2, + ) + data = np.asarray(data, dtype="datetime64[ns]") + data = np.asarray(data) sparse_values, sparse_index, fill_value = make_sparse( data, kind=kind, fill_value=fill_value, dtype=dtype ) @@ -375,7 +398,7 @@ def __init__( def _simple_new( cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype ) -> "SparseArray": - new = cls([]) + new = object.__new__(cls) new._sparse_index = sparse_index new._sparse_values = sparse_array new._dtype = dtype @@ -1025,7 +1048,7 @@ def astype(self, dtype=None, copy=True): Examples -------- - >>> arr = SparseArray([0, 0, 1, 2]) + >>> arr = pd.arrays.SparseArray([0, 0, 1, 2]) >>> arr [0, 0, 1, 2] Fill: 0 @@ -1043,8 +1066,8 @@ def astype(self, dtype=None, copy=True): >>> arr.astype(np.dtype('float64')) ... # doctest: +NORMALIZE_WHITESPACE - [0, 0, 1.0, 2.0] - Fill: 0 + [0.0, 0.0, 1.0, 2.0] + Fill: 0.0 IntIndex Indices: array([2, 3], dtype=int32) @@ -1084,19 +1107,19 @@ def map(self, mapper): Examples -------- >>> arr = pd.arrays.SparseArray([0, 1, 2]) - >>> arr.apply(lambda x: x + 10) + >>> arr.map(lambda x: x + 10) [10, 11, 12] Fill: 10 IntIndex Indices: array([1, 2], dtype=int32) - >>> arr.apply({0: 10, 1: 11, 2: 12}) + >>> arr.map({0: 10, 1: 11, 2: 12}) [10, 11, 12] Fill: 10 IntIndex Indices: array([1, 2], dtype=int32) - >>> arr.apply(pd.Series([10, 11, 12], index=[0, 1, 2])) + >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2])) [10, 11, 12] Fill: 10 IntIndex @@ -1286,14 +1309,14 @@ def mean(self, axis=0, *args, **kwargs): nsparse = self.sp_index.ngaps return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) - def transpose(self, *axes): + def transpose(self, *axes) -> "SparseArray": """ Returns the SparseArray. """ return self @property - def T(self): + def T(self) -> "SparseArray": """ Returns the SparseArray. 
""" @@ -1487,7 +1510,7 @@ def _formatter(self, boxed=False): SparseArray._add_unary_ops() -def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False): +def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy=False): """ Convert ndarray to sparse format @@ -1503,7 +1526,7 @@ def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False): ------- (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar) """ - arr = com.values_from_object(arr) + assert isinstance(arr, np.ndarray) if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index 86869f50aab8e..afa11586fda04 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -217,8 +217,8 @@ def construct_from_string(cls, string: str) -> "SparseDtype": if string.startswith("Sparse"): try: sub_type, has_fill_value = cls._parse_subtype(string) - except ValueError: - raise TypeError(msg) + except ValueError as err: + raise TypeError(msg) from err else: result = SparseDtype(sub_type) msg = ( @@ -347,7 +347,7 @@ def _subtype_with_str(self): dtype('O') >>> dtype._subtype_with_str - str + """ if isinstance(self.fill_value, str): return type(self.fill_value) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index e77256a5aaadd..eafd782dc9b9c 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -134,8 +134,10 @@ def _coo_to_sparse_series(A, dense_index: bool = False): try: s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) - except AttributeError: - raise TypeError(f"Expected coo_matrix. Got {type(A).__name__} instead.") + except AttributeError as err: + raise TypeError( + f"Expected coo_matrix. Got {type(A).__name__} instead." + ) from err s = s.sort_index() s = s.astype(SparseDtype(s.dtype)) if dense_index: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fcccd8cc14d6b..dbca8e74f5e1b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -13,7 +13,8 @@ from pandas import compat from pandas.core import ops -from pandas.core.arrays import PandasArray +from pandas.core.arrays import IntegerArray, PandasArray +from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna @@ -271,6 +272,13 @@ def astype(self, dtype, copy=True): if copy: return self.copy() return self + elif isinstance(dtype, _IntegerDtype): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = 0 + values = arr.astype(dtype.numpy_dtype) + return IntegerArray(values, mask, copy=False) + return super().astype(dtype, copy) def _reduce(self, name, skipna=True, **kwargs): @@ -281,7 +289,7 @@ def value_counts(self, dropna=False): return value_counts(self._ndarray, dropna=dropna).astype("Int64") - # Overrride parent because we have different return types. + # Override parent because we have different return types. @classmethod def _create_arithmetic_method(cls, op): # Note: this handles both arithmetic and comparison methods. 
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a7b16fd86468e..8c93dca783113 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -14,8 +14,8 @@ from pandas.compat.numpy import function as nv from pandas.core.dtypes.common import ( - _NS_DTYPE, - _TD_DTYPE, + DT64NS_DTYPE, + TD64NS_DTYPE, is_dtype_equal, is_float_dtype, is_integer_dtype, @@ -39,6 +39,7 @@ from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick @@ -135,14 +136,13 @@ def dtype(self): ------- numpy.dtype """ - return _TD_DTYPE + return TD64NS_DTYPE # ---------------------------------------------------------------- # Constructors - def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): - if isinstance(values, (ABCSeries, ABCIndexClass)): - values = values._values + def __init__(self, values, dtype=TD64NS_DTYPE, freq=None, copy=False): + values = extract_array(values) inferred_freq = getattr(values, "_freq", None) @@ -167,7 +167,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): # for compat with datetime/timedelta/period shared methods, # we can sometimes get here with int64 values. These represent # nanosecond UTC (or tz-naive) unix timestamps - values = values.view(_TD_DTYPE) + values = values.view(TD64NS_DTYPE) _validate_td64_dtype(values.dtype) dtype = _validate_td64_dtype(dtype) @@ -192,21 +192,21 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): type(self)._validate_frequency(self, freq) @classmethod - def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE): - assert dtype == _TD_DTYPE, dtype + def _simple_new(cls, values, freq=None, dtype=TD64NS_DTYPE): + assert dtype == TD64NS_DTYPE, dtype assert isinstance(values, np.ndarray), type(values) - if values.dtype != _TD_DTYPE: + if values.dtype != TD64NS_DTYPE: assert values.dtype == "i8" - values = values.view(_TD_DTYPE) + values = values.view(TD64NS_DTYPE) result = object.__new__(cls) result._data = values result._freq = to_offset(freq) - result._dtype = _TD_DTYPE + result._dtype = TD64NS_DTYPE return result @classmethod - def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False, freq=None, unit=None): + def _from_sequence(cls, data, dtype=TD64NS_DTYPE, copy=False, freq=None, unit=None): if dtype: _validate_td64_dtype(dtype) freq, freq_infer = dtl.maybe_infer_freq(freq) @@ -258,6 +258,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None): index = _generate_regular_range(start, end, periods, freq) else: index = np.linspace(start.value, end.value, periods).astype("i8") + if len(index) >= 2: + # Infer a frequency + td = Timedelta(index[1] - index[0]) + freq = to_offset(td) if not left_closed: index = index[1:] @@ -385,7 +389,7 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import _get_format_timedelta64 formatter = _get_format_timedelta64(self._data, na_rep) - return np.array([formatter(x) for x in self._data]) + return np.array([formatter(x) for x in self._data.ravel()]).reshape(self.shape) # ---------------------------------------------------------------- # Arithmetic Methods @@ -396,23 +400,6 @@ def _add_offset(self, other): f"cannot add the type {type(other).__name__} to a {type(self).__name__}" ) - def _add_delta(self, delta): - """ - Add a 
timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new TimedeltaArray. - - Parameters - ---------- - other : {timedelta, np.timedelta64, Tick, - TimedeltaIndex, ndarray[timedelta64]} - - Returns - ------- - result : TimedeltaArray - """ - new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, freq="infer") - def _add_datetime_arraylike(self, other): """ Add DatetimeArray/Index or ndarray[datetime64] to TimedeltaArray. @@ -441,7 +428,7 @@ def _add_datetimelike_scalar(self, other): i8 = self.asi8 result = checked_add_with_arr(i8, other.value, arr_mask=self._isnan) result = self._maybe_mask_results(result) - dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE + dtype = DatetimeTZDtype(tz=other.tz) if other.tz else DT64NS_DTYPE return DatetimeArray(result, dtype=dtype, freq=self.freq) def _addsub_object_array(self, other, op): @@ -451,10 +438,10 @@ def _addsub_object_array(self, other, op): # subclasses. Incompatible classes will raise AttributeError, # which we re-raise as TypeError return super()._addsub_object_array(other, op) - except AttributeError: + except AttributeError as err: raise TypeError( f"Cannot add/subtract non-tick DateOffset to {type(self).__name__}" - ) + ) from err def __mul__(self, other): other = lib.item_from_zerodim(other) @@ -614,6 +601,10 @@ def __floordiv__(self, other): if self.freq is not None: # Note: freq gets division, not floor-division freq = self.freq / other + if freq.nanos == 0 and self.freq.nanos != 0: + # e.g. if self.freq is Nano(1) then dividing by 2 + # rounds down to zero + freq = None return type(self)(result.view("m8[ns]"), freq=freq) if not hasattr(other, "dtype"): @@ -817,7 +808,7 @@ def total_seconds(self): """ return self._maybe_mask_results(1e-9 * self.asi8, fill_value=None) - def to_pytimedelta(self): + def to_pytimedelta(self) -> np.ndarray: """ Return Timedelta Array/Index as object ndarray of datetime.timedelta objects. @@ -959,10 +950,10 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): copy = False elif is_timedelta64_dtype(data.dtype): - if data.dtype != _TD_DTYPE: + if data.dtype != TD64NS_DTYPE: # non-nano unit # TODO: watch out for overflows - data = data.astype(_TD_DTYPE) + data = data.astype(TD64NS_DTYPE) copy = False else: @@ -1060,7 +1051,7 @@ def _validate_td64_dtype(dtype): ) raise ValueError(msg) - if not is_dtype_equal(dtype, _TD_DTYPE): + if not is_dtype_equal(dtype, TD64NS_DTYPE): raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]") return dtype diff --git a/pandas/core/base.py b/pandas/core/base.py index 85424e35fa0e0..5945d8a4b432d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,17 +4,15 @@ import builtins import textwrap -from typing import Dict, FrozenSet, List, Optional, Union +from typing import Any, Dict, FrozenSet, List, Optional, Union import numpy as np import pandas._libs.lib as lib -from pandas._typing import T from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, Substitution, cache_readonly, doc -from pandas.util._validators import validate_bool_kwarg +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( @@ -50,6 +48,8 @@ class PandasObject(DirNamesMixin): Baseclass for various pandas objects. 
""" + _cache: Dict[str, Any] + @property def _constructor(self): """ @@ -64,7 +64,7 @@ def __repr__(self) -> str: # Should be overwritten by base classes return object.__repr__(self) - def _reset_cache(self, key=None): + def _reset_cache(self, key: Optional[str] = None) -> None: """ Reset cached properties. If ``key`` is passed, only clears that key. """ @@ -87,15 +87,6 @@ def __sizeof__(self): # no memory_usage attribute, so fall back to object's 'sizeof' return super().__sizeof__() - def _ensure_type(self: T, obj) -> T: - """ - Ensure that an object has same type as self. - - Used by type checkers. - """ - assert isinstance(obj, type(self)), type(obj) - return obj - class NoNewAttributesMixin: """ @@ -131,15 +122,11 @@ def __setattr__(self, key: str, value): object.__setattr__(self, key, value) -class GroupByError(Exception): - pass - - -class DataError(GroupByError): +class DataError(Exception): pass -class SpecificationError(GroupByError): +class SpecificationError(Exception): pass @@ -364,7 +351,8 @@ def _aggregate(self, arg, *args, **kwargs): if isinstance(obj, ABCDataFrame) and len( obj.columns.intersection(keys) ) != len(keys): - raise SpecificationError("nested renamer is not supported") + cols = sorted(set(keys) - set(obj.columns.intersection(keys))) + raise SpecificationError(f"Column(s) {cols} do not exist") from pandas.core.reshape.concat import concat @@ -379,7 +367,7 @@ def _agg_1dim(name, how, subset=None): ) return colg.aggregate(how) - def _agg_2dim(name, how): + def _agg_2dim(how): """ aggregate a 2-dim with how """ @@ -458,7 +446,7 @@ def is_any_frame() -> bool: # return a MI Series try: result = concat(result) - except TypeError: + except TypeError as err: # we want to give a nice error here if # we have non-same sized objects, so # we don't automatically broadcast @@ -467,7 +455,7 @@ def is_any_frame() -> bool: "cannot perform both aggregation " "and transformation operations " "simultaneously" - ) + ) from err return result, True @@ -539,7 +527,7 @@ def _aggregate_multiple_funcs(self, arg, _axis): # raised directly in _aggregate_named pass elif "no results" in str(err): - # raised direcly in _aggregate_multiple_funcs + # raised directly in _aggregate_multiple_funcs pass else: raise @@ -553,7 +541,7 @@ def _aggregate_multiple_funcs(self, arg, _axis): try: return concat(results, keys=keys, axis=1, sort=False) - except TypeError: + except TypeError as err: # we are concatting non-NDFrame objects, # e.g. a list of scalars @@ -562,7 +550,9 @@ def _aggregate_multiple_funcs(self, arg, _axis): result = Series(results, index=keys, name=self.name) if is_nested_object(result): - raise ValueError("cannot combine transform and aggregation operations") + raise ValueError( + "cannot combine transform and aggregation operations" + ) from err return result def _get_cython_func(self, arg: str) -> Optional[str]: @@ -665,7 +655,7 @@ def item(self): ): # numpy returns ints instead of datetime64/timedelta64 objects, # which we need to wrap in Timestamp/Timedelta/Period regardless. - return self.values.item() + return self._values.item() if len(self) == 1: return next(iter(self)) @@ -863,23 +853,6 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): result[self.isna()] = na_value return result - @property - def _ndarray_values(self) -> np.ndarray: - """ - The data as an ndarray, possibly losing information. - - The expectation is that this is cheap to compute, and is primarily - used for interacting with our indexers. 
- - - categorical -> codes - """ - if is_extension_array_dtype(self): - return self.array._ndarray_values - # As a mixin, we depend on the mixing class having values. - # Special mixin syntax may be developed in the future: - # https://github.com/python/typing/issues/246 - return self.values # type: ignore - @property def empty(self): return not self.size @@ -893,6 +866,9 @@ def max(self, axis=None, skipna=True, *args, **kwargs): axis : int, optional For compatibility with NumPy. Only 0 or None are allowed. skipna : bool, default True + Exclude NA/null values when showing the result. + *args, **kwargs + Additional arguments and keywords for compatibility with NumPy. Returns ------- @@ -925,24 +901,57 @@ def max(self, axis=None, skipna=True, *args, **kwargs): nv.validate_max(args, kwargs) return nanops.nanmax(self._values, skipna=skipna) + @doc(op="max", oppose="min", value="largest") def argmax(self, axis=None, skipna=True, *args, **kwargs): """ - Return an ndarray of the maximum argument indexer. + Return int position of the {value} value in the Series. + + If the {op}imum is achieved in multiple locations, + the first row position is returned. Parameters ---------- - axis : {None} + axis : {{None}} Dummy argument for consistency with Series. skipna : bool, default True + Exclude NA/null values when showing the result. + *args, **kwargs + Additional arguments and keywords for compatibility with NumPy. Returns ------- - numpy.ndarray - Indices of the maximum values. + int + Row position of the {op}imum value. See Also -------- - numpy.ndarray.argmax + Series.arg{op} : Return position of the {op}imum value. + Series.arg{oppose} : Return position of the {oppose}imum value. + numpy.ndarray.arg{op} : Equivalent method for numpy arrays. + Series.idxmax : Return index label of the maximum values. + Series.idxmin : Return index label of the minimum values. + + Examples + -------- + Consider dataset containing cereal calories + + >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0, + ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}}) + >>> s + Corn Flakes 100.0 + Almond Delight 110.0 + Cinnamon Toast Crunch 120.0 + Cocoa Puff 110.0 + dtype: float64 + + >>> s.argmax() + 2 + >>> s.argmin() + 0 + + The maximum cereal calories is the third element and + the minimum cereal calories is the first element, + since series is zero-indexed. """ nv.validate_minmax_axis(axis) nv.validate_argmax_with_skipna(skipna, args, kwargs) @@ -957,6 +966,9 @@ def min(self, axis=None, skipna=True, *args, **kwargs): axis : {None} Dummy argument for consistency with Series. skipna : bool, default True + Exclude NA/null values when showing the result. + *args, **kwargs + Additional arguments and keywords for compatibility with NumPy. Returns ------- @@ -989,24 +1001,8 @@ def min(self, axis=None, skipna=True, *args, **kwargs): nv.validate_min(args, kwargs) return nanops.nanmin(self._values, skipna=skipna) + @doc(argmax, op="min", oppose="max", value="smallest") def argmin(self, axis=None, skipna=True, *args, **kwargs): - """ - Return a ndarray of the minimum argument indexer. - - Parameters - ---------- - axis : {None} - Dummy argument for consistency with Series. 
- skipna : bool, default True - - Returns - ------- - numpy.ndarray - - See Also - -------- - numpy.ndarray.argmin - """ nv.validate_minmax_axis(axis) nv.validate_argmax_with_skipna(skipna, args, kwargs) return nanops.nanargmin(self._values, skipna=skipna) @@ -1025,7 +1021,8 @@ def tolist(self): See Also -------- - numpy.ndarray.tolist + numpy.ndarray.tolist : Return the array as an a.ndim-levels deep + nested list of Python scalars. """ if not isinstance(self._values, np.ndarray): # check for ndarray instead of dtype to catch DTA/TDA @@ -1130,10 +1127,8 @@ def _map_values(self, mapper, na_action=None): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values return self._values.map(mapper) - if is_extension_array_dtype(self.dtype): - values = self._values - else: - values = self.values + + values = self._values indexer = mapper.index.get_indexer(values) new_values = algorithms.take_1d(mapper._values, indexer) @@ -1155,8 +1150,14 @@ def _map_values(self, mapper, na_action=None): def map_f(values, f): return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) - else: + elif na_action is None: map_f = lib.map_infer + else: + msg = ( + "na_action must either be 'ignore' or None, " + f"{na_action} was passed" + ) + raise ValueError(msg) # mapper is a function new_values = map_f(values, mapper) @@ -1372,7 +1373,8 @@ def memory_usage(self, deep=False): See Also -------- - numpy.ndarray.nbytes + numpy.ndarray.nbytes : Total bytes consumed by the elements of the + array. Notes ----- @@ -1384,7 +1386,7 @@ def memory_usage(self, deep=False): v = self.array.nbytes if deep and is_object_dtype(self) and not PYPY: - v += lib.memory_usage_of_objects(self.array) + v += lib.memory_usage_of_objects(self._values) return v @doc( @@ -1408,13 +1410,13 @@ def factorize(self, sort=False, na_sentinel=-1): ] = """ Find indices where elements should be inserted to maintain order. - Find the indices into a sorted %(klass)s `self` such that, if the + Find the indices into a sorted {klass} `self` such that, if the corresponding elements in `value` were inserted before the indices, the order of `self` would be preserved. .. note:: - The %(klass)s *must* be monotonically sorted, otherwise + The {klass} *must* be monotonically sorted, otherwise wrong locations will likely be returned. Pandas does *not* check this for you. @@ -1422,7 +1424,7 @@ def factorize(self, sort=False, na_sentinel=-1): ---------- value : array_like Values to insert into `self`. - side : {'left', 'right'}, optional + side : {{'left', 'right'}}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). @@ -1443,8 +1445,8 @@ def factorize(self, sort=False, na_sentinel=-1): See Also -------- - sort_values - numpy.searchsorted + sort_values : Sort by the values along either axis. + numpy.searchsorted : Similar method from NumPy. 
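The stricter `na_action` check added to `_map_values` above surfaces through `Series.map`. A minimal sketch, assuming a pandas build that includes this validation:

```python
import pandas as pd

s = pd.Series(["cat", "dog", None])

# NA entries are skipped rather than handed to the function
print(s.map(str.upper, na_action="ignore"))

# Anything other than "ignore" or None is now rejected up front
try:
    s.map(str.upper, na_action=True)
except ValueError as err:
    print(err)  # na_action must either be 'ignore' or None, True was passed
```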
Notes ----- @@ -1452,61 +1454,64 @@ def factorize(self, sort=False, na_sentinel=-1): Examples -------- - >>> x = pd.Series([1, 2, 3]) - >>> x + >>> ser = pd.Series([1, 2, 3]) + >>> ser 0 1 1 2 2 3 dtype: int64 - >>> x.searchsorted(4) + >>> ser.searchsorted(4) 3 - >>> x.searchsorted([0, 4]) + >>> ser.searchsorted([0, 4]) array([0, 3]) - >>> x.searchsorted([1, 3], side='left') + >>> ser.searchsorted([1, 3], side='left') array([0, 2]) - >>> x.searchsorted([1, 3], side='right') + >>> ser.searchsorted([1, 3], side='right') array([1, 3]) - >>> x = pd.Categorical(['apple', 'bread', 'bread', - 'cheese', 'milk'], ordered=True) + >>> ser = pd.Categorical( + ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True + ... ) + >>> ser [apple, bread, bread, cheese, milk] Categories (4, object): [apple < bread < cheese < milk] - >>> x.searchsorted('bread') + >>> ser.searchsorted('bread') 1 - >>> x.searchsorted(['bread'], side='right') + >>> ser.searchsorted(['bread'], side='right') array([3]) If the values are not monotonically sorted, wrong locations may be returned: - >>> x = pd.Series([2, 1, 3]) - >>> x.searchsorted(1) + >>> ser = pd.Series([2, 1, 3]) + >>> ser + 0 2 + 1 1 + 2 3 + dtype: int64 + + >>> ser.searchsorted(1) # doctest: +SKIP 0 # wrong result, correct would be 1 """ - @Substitution(klass="Index") - @Appender(_shared_docs["searchsorted"]) + @doc(_shared_docs["searchsorted"], klass="Index") def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) - def drop_duplicates(self, keep="first", inplace=False): - inplace = validate_bool_kwarg(inplace, "inplace") + def drop_duplicates(self, keep="first"): if isinstance(self, ABCIndexClass): if self.is_unique: return self._shallow_copy() duplicated = self.duplicated(keep=keep) result = self[np.logical_not(duplicated)] - if inplace: - return self._update_inplace(result) - else: - return result + return result def duplicated(self, keep="first"): if isinstance(self, ABCIndexClass): @@ -1516,10 +1521,4 @@ def duplicated(self, keep="first"): else: return self._constructor( duplicated(self, keep=keep), index=self.index - ).__finalize__(self) - - # ---------------------------------------------------------------------- - # abstracts - - def _update_inplace(self, result, verify_is_copy=True, **kwargs): - raise AbstractMethodError(self) + ).__finalize__(self, method="duplicated") diff --git a/pandas/core/common.py b/pandas/core/common.py index 705c618fc49dc..8b152162dc95a 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -15,6 +15,7 @@ from pandas._libs import lib, tslibs from pandas._typing import T +from pandas.compat.numpy import _np_version_under1p17 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -87,9 +88,6 @@ def maybe_box_datetimelike(value, dtype=None): return value -values_from_object = lib.values_from_object - - def is_bool_indexer(key: Any) -> bool: """ Check whether `key` is a valid boolean indexer. 
@@ -122,7 +120,7 @@ def is_bool_indexer(key: Any) -> bool: is_array_like(key) and is_extension_array_dtype(key.dtype) ): if key.dtype == np.object_: - key = np.asarray(values_from_object(key)) + key = np.asarray(key) if not lib.is_bool_array(key): na_msg = "Cannot mask with non-boolean array containing NA / NaN values" @@ -215,7 +213,7 @@ def asarray_tuplesafe(values, dtype=None): if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")): values = list(values) elif isinstance(values, ABCIndexClass): - return values.values + return values._values if isinstance(values, list) and dtype in [np.object_, object]: return construct_1d_object_array_from_listlike(values) @@ -359,8 +357,6 @@ def standardize_mapping(into): """ Helper function to standardize a supplied mapping. - .. versionadded:: 0.21.0 - Parameters ---------- into : instance or subclass of collections.abc.Mapping @@ -395,18 +391,30 @@ def random_state(state=None): Parameters ---------- - state : int, np.random.RandomState, None. - If receives an int, passes to np.random.RandomState() as seed. + state : int, array-like, BitGenerator (NumPy>=1.17), np.random.RandomState, None. + If receives an int, array-like, or BitGenerator, passes to + np.random.RandomState() as seed. If receives an np.random.RandomState object, just returns object. If receives `None`, returns np.random. If receives anything else, raises an informative ValueError. + + ..versionchanged:: 1.1.0 + + array-like and BitGenerator (for NumPy>=1.17) object now passed to + np.random.RandomState() as seed + Default None. Returns ------- np.random.RandomState + """ - if is_integer(state): + if ( + is_integer(state) + or is_array_like(state) + or (not _np_version_under1p17 and isinstance(state, np.random.BitGenerator)) + ): return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): return state @@ -414,7 +422,10 @@ def random_state(state=None): return np.random else: raise ValueError( - "random_state must be an integer, a numpy RandomState, or None" + ( + "random_state must be an integer, array-like, a BitGenerator, " + "a numpy RandomState, or None" + ) ) diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 19a8898a2987c..327ec21c3c11c 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -24,7 +24,3 @@ def result_type_many(*arrays_and_dtypes): except ValueError: # we have > NPY_MAXARGS terms in our expression return reduce(np.result_type, arrays_and_dtypes) - - -class NameResolutionError(NameError): - pass diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index f6947d5ec6233..b74f99fca21c7 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ Top level ``eval`` module. """ @@ -266,8 +265,10 @@ def eval( See Also -------- - DataFrame.query - DataFrame.eval + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. Notes ----- @@ -362,8 +363,8 @@ def eval( if not inplace and first_expr: try: target = env.target.copy() - except AttributeError: - raise ValueError("Cannot return a copy of the target") + except AttributeError as err: + raise ValueError("Cannot return a copy of the target") from err else: target = env.target @@ -375,8 +376,8 @@ def eval( with warnings.catch_warnings(record=True): # TODO: Filter the warnings we actually care about here. 
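For the `random_state` helper change above: besides an integer, an array-like seed or (on NumPy >= 1.17) a `BitGenerator` is now forwarded to `np.random.RandomState`. A sketch of how this surfaces through sampling, assuming NumPy >= 1.17 and a pandas build with this change:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": range(10)})

# Both seeds below end up in np.random.RandomState via core.common.random_state
print(df.sample(3, random_state=np.array([1, 2, 3])))    # array-like seed
print(df.sample(3, random_state=np.random.MT19937(42)))  # BitGenerator seed
```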
target[assigner] = ret - except (TypeError, IndexError): - raise ValueError("Cannot assign expression output to target") + except (TypeError, IndexError) as err: + raise ValueError("Cannot assign expression output to target") from err if not resolvers: resolvers = ({assigner: ret},) diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index c59952bea8dc0..6cd9a15b70d39 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -635,8 +635,9 @@ def visit_Attribute(self, node, **kwargs): # something like datetime.datetime where scope is overridden if isinstance(value, ast.Name) and value.id == attr: return resolved + raise - raise ValueError(f"Invalid Attribute context {ctx.__name__}") + raise ValueError(f"Invalid Attribute context {type(ctx).__name__}") def visit_Call(self, node, side=None, **kwargs): diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index fdc299ccdfde8..7f93472c766d7 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -121,12 +121,12 @@ def _evaluate_numexpr(op, op_str, a, b): def _where_standard(cond, a, b): - # Caller is responsible for calling values_from_object if necessary + # Caller is responsible for extracting ndarray if necessary return np.where(cond, a, b) def _where_numexpr(cond, a, b): - # Caller is responsible for calling values_from_object if necessary + # Caller is responsible for extracting ndarray if necessary result = None if _can_use_numexpr(None, "where", a, b, "where"): diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 7ed089b283903..bc9ff7c44b689 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -372,12 +372,12 @@ def __init__(self, op: str, lhs, rhs): try: self.func = _binary_ops_dict[op] - except KeyError: + except KeyError as err: # has to be made a list for python3 keys = list(_binary_ops_dict.keys()) raise ValueError( f"Invalid binary operator {repr(op)}, valid operators are {keys}" - ) + ) from err def __call__(self, env): """ @@ -550,11 +550,11 @@ def __init__(self, op: str, operand): try: self.func = _unary_ops_dict[op] - except KeyError: + except KeyError as err: raise ValueError( f"Invalid unary operator {repr(op)}, " f"valid operators are {_unary_ops_syms}" - ) + ) from err def __call__(self, env): operand = self.operand(env) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 92a2c20cd2a9e..c7c7103654a65 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -116,7 +116,7 @@ def clean_column_name(name: str) -> str: If this name was used in the query string (this makes the query call impossible) an error will be raised by :func:`tokenize_backtick_quoted_string` instead, - which is not catched and propogates to the user level. + which is not caught and propagates to the user level. 
+ which is not caught and propagates to the user level.
""" try: tokenized = tokenize_string(f"`{name}`") @@ -185,7 +185,7 @@ def tokenize_string(source: str) -> Iterator[Tuple[int, str]]: yield tokenize_backtick_quoted_string( token_generator, source, string_start=start[1] + 1 ) - except Exception: - raise SyntaxError(f"Failed to parse backticks in '{source}'.") + except Exception as err: + raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err else: yield toknum, tokval diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 828ec11c2bd38..15d9987310f18 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -17,6 +17,7 @@ from pandas.core.computation.common import _ensure_decoded from pandas.core.computation.expr import BaseExprVisitor from pandas.core.computation.ops import UndefinedVariableError, is_term +from pandas.core.construction import extract_array from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded @@ -202,7 +203,7 @@ def stringify(value): v = Timedelta(v, unit="s").value return TermValue(int(v), v, kind) elif meta == "category": - metadata = com.values_from_object(self.metadata) + metadata = extract_array(self.metadata, extract_numpy=True) result = metadata.searchsorted(v, side="left") # result returns 0 if v is first element or if v is not in metadata @@ -424,8 +425,10 @@ def visit_Subscript(self, node, **kwargs): try: return self.const_type(value[slobj], self.env) - except TypeError: - raise ValueError(f"cannot subscript {repr(value)} with {repr(slobj)}") + except TypeError as err: + raise ValueError( + f"cannot subscript {repr(value)} with {repr(slobj)}" + ) from err def visit_Attribute(self, node, **kwargs): attr = node.attr @@ -575,18 +578,18 @@ def evaluate(self): """ create and return the numexpr condition and filter """ try: self.condition = self.terms.prune(ConditionBinOp) - except AttributeError: + except AttributeError as err: raise ValueError( f"cannot process expression [{self.expr}], [{self}] " "is not a valid condition" - ) + ) from err try: self.filter = self.terms.prune(FilterBinOp) - except AttributeError: + except AttributeError as err: raise ValueError( f"cannot process expression [{self.expr}], [{self}] " "is not a valid filter" - ) + ) from err return self.condition, self.filter diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 937c81fdeb8d6..83bf92ad737e4 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -197,11 +197,11 @@ def resolve(self, key: str, is_local: bool): # these are created when parsing indexing expressions # e.g., df[df > 0] return self.temps[key] - except KeyError: + except KeyError as err: # runtime import because ops imports from scope from pandas.core.computation.ops import UndefinedVariableError - raise UndefinedVariableError(key, is_local) + raise UndefinedVariableError(key, is_local) from err def swapkey(self, old_key: str, new_key: str, new_value=None): """ diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f947a1fda49f1..2d60ad9ba50bf 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -4,6 +4,8 @@ These should not depend on core.internals. 
""" + +from collections import abc from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast import numpy as np @@ -185,7 +187,7 @@ def array( >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') - ['01:00:00', '02:00:00'] + ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] Examples @@ -200,12 +202,12 @@ def array( >>> pd.array([1, 2, np.nan]) - [1, 2, NaN] + [1, 2, ] Length: 3, dtype: Int64 >>> pd.array(["a", None, "c"]) - ['a', nan, 'c'] + ['a', , 'c'] Length: 3, dtype: string >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) @@ -445,6 +447,8 @@ def sanitize_array( # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) + elif isinstance(data, abc.Set): + raise TypeError("Set type is unordered") else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c2b600b5d8c5b..7dda6850ba4f7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -3,6 +3,7 @@ """ from datetime import date, datetime, timedelta +from typing import TYPE_CHECKING, Type import numpy as np @@ -16,14 +17,14 @@ iNaT, ) from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import Dtype +from pandas._typing import Dtype, DtypeObj from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( _INT64_DTYPE, - _NS_DTYPE, _POSSIBLY_CAST_DTYPES, - _TD_DTYPE, + DT64NS_DTYPE, + TD64NS_DTYPE, ensure_int8, ensure_int16, ensure_int32, @@ -32,6 +33,7 @@ ensure_str, is_bool, is_bool_dtype, + is_categorical_dtype, is_complex, is_complex_dtype, is_datetime64_dtype, @@ -63,6 +65,7 @@ ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, + ABCExtensionArray, ABCPeriodArray, ABCPeriodIndex, ABCSeries, @@ -70,6 +73,10 @@ from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import isna, notna +if TYPE_CHECKING: + from pandas import Series + from pandas.core.arrays import ExtensionArray # noqa: F401 + _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max _int32_max = np.iinfo(np.int32).max @@ -246,6 +253,100 @@ def trans(x): return result +def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: str = ""): + """ + Try casting result to a different type if appropriate + + Parameters + ---------- + result : array-like + Result to cast. + obj : Series + Input Series from which result was calculated. + numeric_only : bool, default False + Whether to cast only numerics or datetimes as well. + how : str, default "" + How the result was computed. + + Returns + ------- + result : array-like + result maybe casted to the dtype. + """ + if obj.ndim > 1: + dtype = obj._values.dtype + else: + dtype = obj.dtype + dtype = maybe_cast_result_dtype(dtype, how) + + if not is_scalar(result): + if ( + is_extension_array_dtype(dtype) + and not is_categorical_dtype(dtype) + and dtype.kind != "M" + ): + # We have to special case categorical so as not to upcast + # things like counts back to categorical + cls = dtype.construct_array_type() + result = maybe_cast_to_extension_array(cls, result, dtype=dtype) + + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: + result = maybe_downcast_to_dtype(result, dtype) + + return result + + +def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: + """ + Get the desired dtype of a result based on the + input dtype and how it was computed. 
+ + Parameters + ---------- + dtype : DtypeObj + Input dtype. + how : str + How the result was computed. + + Returns + ------- + DtypeObj + The desired dtype of the result. + """ + d = { + (np.dtype(np.bool), "add"): np.dtype(np.int64), + (np.dtype(np.bool), "cumsum"): np.dtype(np.int64), + (np.dtype(np.bool), "sum"): np.dtype(np.int64), + } + return d.get((dtype, how), dtype) + + +def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): + """ + Call to `_from_sequence` that returns the object unchanged on Exception. + + Parameters + ---------- + cls : class, subclass of ExtensionArray + obj : arraylike + Values to pass to cls._from_sequence + dtype : ExtensionDtype, optional + + Returns + ------- + ExtensionArray or obj + """ + assert isinstance(cls, type), f"must pass a type: {cls}" + assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" + assert issubclass(cls, ABCExtensionArray), assertion_msg + try: + result = cls._from_sequence(obj, dtype=dtype) + except Exception: + # We can't predict what downstream EA constructors may raise + result = obj + return result + + def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): """ A safe version of putmask that potentially upcasts the result. @@ -783,9 +884,9 @@ def coerce_to_dtypes(result, dtypes): def conv(r, dtype): if np.any(isna(r)): pass - elif dtype == _NS_DTYPE: + elif dtype == DT64NS_DTYPE: r = tslibs.Timestamp(r) - elif dtype == _TD_DTYPE: + elif dtype == TD64NS_DTYPE: r = tslibs.Timedelta(r) elif dtype == np.bool_: # messy. non 0/1 integers do not get converted. @@ -853,7 +954,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) - if dtype not in [_INT64_DTYPE, _TD_DTYPE]: + if dtype not in [_INT64_DTYPE, TD64NS_DTYPE]: # allow frequency conversions # we return a float here! 
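The `(dtype, how)` table in `maybe_cast_result_dtype` above maps boolean sums to `int64`, which is what keeps grouped sums of boolean columns integer-valued. An illustrative sketch:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "flag": [True, True, False]})

# Summing booleans falls under (bool, "sum") -> int64, so the grouped
# result counts the True values rather than staying boolean.
print(df.groupby("key")["flag"].sum())
# key
# a    2
# b    0
# Name: flag, dtype: int64
```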
@@ -862,8 +963,8 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): result = arr.astype(dtype).astype(np.float64) result[mask] = np.nan return result - elif dtype == _TD_DTYPE: - return arr.astype(_TD_DTYPE, copy=copy) + elif dtype == TD64NS_DTYPE: + return arr.astype(TD64NS_DTYPE, copy=copy) raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") @@ -888,7 +989,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): elif is_timedelta64_dtype(dtype): from pandas import to_timedelta - return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy) + return astype_nansafe(to_timedelta(arr)._values, dtype, copy=copy) if dtype.name in ("datetime64", "timedelta64"): msg = ( @@ -1049,7 +1150,8 @@ def convert_dtypes( dtype new dtype """ - if convert_string or convert_integer or convert_boolean: + is_extension = is_extension_array_dtype(input_array.dtype) + if (convert_string or convert_integer or convert_boolean) and not is_extension: try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: @@ -1062,9 +1164,7 @@ def convert_dtypes( if convert_integer: target_int_dtype = "Int64" - if is_integer_dtype(input_array.dtype) and not is_extension_array_dtype( - input_array.dtype - ): + if is_integer_dtype(input_array.dtype): from pandas.core.arrays.integer import _dtypes inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) @@ -1078,9 +1178,7 @@ def convert_dtypes( inferred_dtype = input_array.dtype if convert_boolean: - if is_bool_dtype(input_array.dtype) and not is_extension_array_dtype( - input_array.dtype - ): + if is_bool_dtype(input_array.dtype): inferred_dtype = "boolean" else: if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": @@ -1181,9 +1279,11 @@ def try_timedelta(v): from pandas import to_timedelta try: - return to_timedelta(v)._ndarray_values.reshape(shape) + td_values = to_timedelta(v) except ValueError: return v.reshape(shape) + else: + return np.asarray(td_values).reshape(shape) inferred_type = lib.infer_datetimelike_array(ensure_object(v)) @@ -1236,14 +1336,14 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): f"Please pass in '{dtype.name}[ns]' instead." 
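The `is_extension` short-circuit added to `convert_dtypes` above means values that already use an extension dtype are returned unchanged instead of being re-inferred. A minimal sketch:

```python
import pandas as pd

s = pd.Series([1, 2, None], dtype="Int64")

# Already an extension dtype, so inference is skipped and Int64 is kept
print(s.convert_dtypes().dtype)  # Int64
```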
) - if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): + if is_datetime64 and not is_dtype_equal(dtype, DT64NS_DTYPE): # pandas supports dtype whose granularity is less than [ns] # e.g., [ps], [fs], [as] if dtype <= np.dtype("M8[ns]"): if dtype.name == "datetime64": raise ValueError(msg) - dtype = _NS_DTYPE + dtype = DT64NS_DTYPE else: raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]") elif is_datetime64tz: @@ -1254,14 +1354,14 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): if is_scalar(value) and isna(value): value = [value] - elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): + elif is_timedelta64 and not is_dtype_equal(dtype, TD64NS_DTYPE): # pandas supports dtype whose granularity is less than [ns] # e.g., [ps], [fs], [as] if dtype <= np.dtype("m8[ns]"): if dtype.name == "timedelta64": raise ValueError(msg) - dtype = _TD_DTYPE + dtype = TD64NS_DTYPE else: raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]") @@ -1309,8 +1409,8 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): # coerce datetimelike to object elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): if is_object_dtype(dtype): - if value.dtype != _NS_DTYPE: - value = value.astype(_NS_DTYPE) + if value.dtype != DT64NS_DTYPE: + value = value.astype(DT64NS_DTYPE) ints = np.asarray(value).view("i8") return tslib.ints_to_pydatetime(ints) @@ -1326,10 +1426,10 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): if is_array and value.dtype.kind in ["M", "m"]: dtype = value.dtype - if dtype.kind == "M" and dtype != _NS_DTYPE: + if dtype.kind == "M" and dtype != DT64NS_DTYPE: value = tslibs.conversion.ensure_datetime64ns(value) - elif dtype.kind == "m" and dtype != _TD_DTYPE: + elif dtype.kind == "m" and dtype != TD64NS_DTYPE: value = to_timedelta(value) # only do this if we have an array and the dtype of the array is not @@ -1573,11 +1673,11 @@ def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): casted = np.array(arr, dtype=dtype, copy=copy) else: casted = arr.astype(dtype, copy=copy) - except OverflowError: + except OverflowError as err: raise OverflowError( "The elements provided in the data cannot all be " f"casted to the dtype {dtype}" - ) + ) from err if np.array_equal(arr, casted): return casted diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c0420244f671e..b2301ab0190c7 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import algos, lib +from pandas._libs import algos from pandas._libs.tslibs import conversion from pandas._typing import ArrayLike, DtypeObj @@ -19,18 +19,12 @@ PeriodDtype, registry, ) -from pandas.core.dtypes.generic import ( - ABCCategorical, - ABCDatetimeIndex, - ABCIndexClass, - ABCPeriodArray, - ABCPeriodIndex, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCCategorical, ABCIndexClass from pandas.core.dtypes.inference import ( # noqa:F401 is_array_like, is_bool, is_complex, + is_dataclass, is_decimal, is_dict_like, is_file_like, @@ -64,8 +58,8 @@ ] } -_NS_DTYPE = conversion.NS_DTYPE -_TD_DTYPE = conversion.TD_DTYPE +DT64NS_DTYPE = conversion.DT64NS_DTYPE +TD64NS_DTYPE = conversion.TD64NS_DTYPE _INT64_DTYPE = np.dtype(np.int64) # oh the troubles to reduce import time @@ -131,7 +125,7 @@ def ensure_categorical(arr): cat_arr : The original array cast as a Categorical. If it already is a Categorical, we return as is. 
""" - if not is_categorical(arr): + if not is_categorical_dtype(arr.dtype): from pandas import Categorical arr = Categorical(arr) @@ -194,12 +188,14 @@ def ensure_python_int(value: Union[int, np.integer]) -> int: TypeError: if the value isn't an int or can't be converted to one. """ if not is_scalar(value): - raise TypeError(f"Value needs to be a scalar value, was type {type(value)}") + raise TypeError( + f"Value needs to be a scalar value, was type {type(value).__name__}" + ) try: new_value = int(value) assert new_value == value - except (TypeError, ValueError, AssertionError): - raise TypeError(f"Wrong type {type(value)} for value {value}") + except (TypeError, ValueError, AssertionError) as err: + raise TypeError(f"Wrong type {type(value)} for value {value}") from err return new_value @@ -364,6 +360,12 @@ def is_categorical(arr) -> bool: >>> is_categorical(pd.CategoricalIndex([1, 2, 3])) True """ + warnings.warn( + "is_categorical is deprecated and will be removed in a future version. " + "Use is_categorical_dtype instead", + FutureWarning, + stacklevel=2, + ) return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) @@ -606,71 +608,6 @@ def is_excluded_dtype(dtype) -> bool: return _is_dtype(arr_or_dtype, condition) -def is_period_arraylike(arr) -> bool: - """ - Check whether an array-like is a periodical array-like or PeriodIndex. - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a periodical array-like or - PeriodIndex instance. - - Examples - -------- - >>> is_period_arraylike([1, 2, 3]) - False - >>> is_period_arraylike(pd.Index([1, 2, 3])) - False - >>> is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D")) - True - """ - if isinstance(arr, (ABCPeriodIndex, ABCPeriodArray)): - return True - elif isinstance(arr, (np.ndarray, ABCSeries)): - return is_period_dtype(arr.dtype) - return getattr(arr, "inferred_type", None) == "period" - - -def is_datetime_arraylike(arr) -> bool: - """ - Check whether an array-like is a datetime array-like or DatetimeIndex. - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a datetime array-like or - DatetimeIndex. - - Examples - -------- - >>> is_datetime_arraylike([1, 2, 3]) - False - >>> is_datetime_arraylike(pd.Index([1, 2, 3])) - False - >>> is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3])) - True - """ - if isinstance(arr, ABCDatetimeIndex): - return True - elif isinstance(arr, (np.ndarray, ABCSeries)): - return ( - is_object_dtype(arr.dtype) - and lib.infer_dtype(arr, skipna=False) == "datetime" - ) - return getattr(arr, "inferred_type", None) == "datetime" - - def is_dtype_equal(source, target) -> bool: """ Check if two dtypes are equal. 
@@ -1050,7 +987,7 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: tipo = _get_dtype(arr_or_dtype.dtype) else: return False - return tipo == _NS_DTYPE or getattr(tipo, "base", None) == _NS_DTYPE + return tipo == DT64NS_DTYPE or getattr(tipo, "base", None) == DT64NS_DTYPE def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: @@ -1081,7 +1018,7 @@ def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) False """ - return _is_dtype(arr_or_dtype, lambda dtype: dtype == _TD_DTYPE) + return _is_dtype(arr_or_dtype, lambda dtype: dtype == TD64NS_DTYPE) def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool: @@ -1122,23 +1059,6 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool: return _is_dtype_type(arr_or_dtype, classes(np.datetime64, np.timedelta64)) -def _is_unorderable_exception(e: TypeError) -> bool: - """ - Check if the exception raised is an unorderable exception. - - Parameters - ---------- - e : Exception or sub-class - The exception object to check. - - Returns - ------- - bool - Whether or not the exception raised is an unorderable exception. - """ - return "'>' not supported between instances of" in str(e) - - # This exists to silence numpy deprecation warnings, see GH#29553 def is_numeric_v_string_like(a, b): """ @@ -1527,7 +1447,7 @@ def is_extension_type(arr) -> bool: stacklevel=2, ) - if is_categorical(arr): + if is_categorical_dtype(arr): return True elif is_sparse(arr): return True @@ -1801,7 +1721,7 @@ def _validate_date_like_dtype(dtype) -> None: try: typ = np.datetime_data(dtype)[0] except ValueError as e: - raise TypeError(e) + raise TypeError(e) from e if typ != "generic" and typ != "ns": raise ValueError( f"{repr(dtype.name)} is too specific of a frequency, " @@ -1840,9 +1760,9 @@ def pandas_dtype(dtype) -> DtypeObj: # raise a consistent TypeError if failed try: npdtype = np.dtype(dtype) - except SyntaxError: + except SyntaxError as err: # np.dtype uses `eval` which can raise SyntaxError - raise TypeError(f"data type '{dtype}' not understood") + raise TypeError(f"data type '{dtype}' not understood") from err # Any invalid dtype (such as pd.Timestamp) should raise an error. # np.dtype(invalid_type).kind = 0 for such objects. However, this will diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 49034616b374a..301c9bb7b3f5c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -7,8 +7,8 @@ from pandas._libs import tslib, tslibs from pandas.core.dtypes.common import ( - _NS_DTYPE, - _TD_DTYPE, + DT64NS_DTYPE, + TD64NS_DTYPE, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, @@ -97,6 +97,9 @@ def is_nonempty(x) -> bool: # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. 
+ non_empties = [x for x in to_concat if is_nonempty(x)] + if non_empties and axis == 0: + to_concat = non_empties typs = get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith("datetime") for typ in typs) @@ -114,10 +117,17 @@ def is_nonempty(x) -> bool: elif "sparse" in typs: return _concat_sparse(to_concat, axis=axis, typs=typs) - all_empty = all(not is_nonempty(x) for x in to_concat) - if any(is_extension_array_dtype(x) for x in to_concat) and axis == 1: + all_empty = not len(non_empties) + single_dtype = len({x.dtype for x in to_concat}) == 1 + any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) + + if any_ea and axis == 1: to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] + elif any_ea and single_dtype and axis == 0: + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + if all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise @@ -283,7 +293,7 @@ def union_categoricals( Categories (3, object): [b, c, a] """ from pandas import Index, Categorical - from pandas.core.arrays.categorical import _recode_for_categories + from pandas.core.arrays.categorical import recode_for_categories if len(to_union) == 0: raise ValueError("No Categoricals to union") @@ -315,7 +325,7 @@ def _maybe_unwrap(x): new_codes = np.concatenate([c.codes for c in to_union]) else: codes = [first.codes] + [ - _recode_for_categories(other.codes, other.categories, first.categories) + recode_for_categories(other.codes, other.categories, first.categories) for other in to_union[1:] ] new_codes = np.concatenate(codes) @@ -338,7 +348,7 @@ def _maybe_unwrap(x): categories = categories.sort_values() new_codes = [ - _recode_for_categories(c.codes, c.categories, categories) for c in to_union + recode_for_categories(c.codes, c.categories, categories) for c in to_union ] new_codes = np.concatenate(new_codes) else: @@ -391,7 +401,7 @@ def concat_datetime(to_concat, axis=0, typs=None): if "datetime" in typs: to_concat = [x.astype(np.int64, copy=False) for x in to_concat] - return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE) + return _concatenate_2d(to_concat, axis=axis).view(DT64NS_DTYPE) else: # when to_concat has different tz, len(typs) > 1. 
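The `union_categoricals` path touched above (now using `recode_for_categories`) keeps its documented behaviour: categories are taken in order of appearance across the inputs, as in the docstring excerpt earlier in this hunk. For example:

```python
import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(["b", "c"])
b = pd.Categorical(["a", "b"])

# Values are concatenated and the categories follow order of appearance: b, c, a
print(union_categoricals([a, b]))
```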
# thus no need to care @@ -399,7 +409,7 @@ def concat_datetime(to_concat, axis=0, typs=None): elif "timedelta" in typs: return _concatenate_2d([x.view(np.int64) for x in to_concat], axis=axis).view( - _TD_DTYPE + TD64NS_DTYPE ) elif any(typ.startswith("period") for typ in typs): @@ -413,7 +423,7 @@ def _convert_datetimelike_to_object(x): # coerce datetimelike array to object dtype # if dtype is of datetimetz or timezone - if x.dtype.kind == _NS_DTYPE.kind: + if x.dtype.kind == DT64NS_DTYPE.kind: if getattr(x, "tz", None) is not None: x = np.asarray(x.astype(object)) else: @@ -421,7 +431,7 @@ def _convert_datetimelike_to_object(x): x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp") x = x.reshape(shape) - elif x.dtype == _TD_DTYPE: + elif x.dtype == TD64NS_DTYPE: shape = x.shape x = tslibs.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True) x = x.reshape(shape) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 0730de934b56c..8fe2b3c60d6d0 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,7 +3,18 @@ """ import re -from typing import Any, Dict, List, MutableMapping, Optional, Tuple, Type, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + MutableMapping, + Optional, + Tuple, + Type, + Union, + cast, +) import numpy as np import pytz @@ -16,6 +27,15 @@ from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass from pandas.core.dtypes.inference import is_bool, is_list_like +if TYPE_CHECKING: + import pyarrow # noqa: F401 + from pandas.core.arrays import ( # noqa: F401 + IntervalArray, + PeriodArray, + DatetimeArray, + ) + from pandas import Categorical # noqa: F401 + str_type = str @@ -68,7 +88,7 @@ def register(self, dtype: Type[ExtensionDtype]) -> None: """ Parameters ---------- - dtype : ExtensionDtype + dtype : ExtensionDtype class """ if not issubclass(dtype, ExtensionDtype): raise ValueError("can only register pandas extension dtypes") @@ -122,7 +142,7 @@ class PandasExtensionDtype(ExtensionDtype): # and ExtensionDtype's @properties in the subclasses below. The kind and # type variables in those subclasses are explicitly typed below. subdtype = None - str: Optional[str_type] = None + str: str_type num = 100 shape: Tuple[int, ...] = tuple() itemsize = 8 @@ -169,12 +189,12 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ Type for categorical data with the categories and orderedness. - .. versionchanged:: 0.21.0 - Parameters ---------- categories : sequence, optional Must be unique, and must not contain any nulls. + The categories are stored in an Index, + and if an index is provided the dtype of that index will be used. ordered : bool or None, default False Whether or not this categorical is treated as a ordered categorical. None can be used to maintain the ordered value of existing categoricals when @@ -192,7 +212,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): See Also -------- - Categorical + Categorical : Represent a categorical variable in classic R / S-plus fashion. Notes ----- @@ -210,6 +230,12 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): 3 NaN dtype: category Categories (2, object): [b < a] + + An empty CategoricalDtype with a specific dtype can be created + by providing an empty index. 
As follows, + + >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype + dtype('<M8[ns]') """ >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2) CategoricalDtype(categories=['x', 'y'], ordered=False) """ - from pandas.core.dtypes.common import is_categorical if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) @@ -326,7 +351,7 @@ def _from_values_or_dtype( ) elif not isinstance(dtype, CategoricalDtype): raise ValueError(f"Cannot not construct CategoricalDtype from {dtype}") - elif is_categorical(values): + elif cls.is_dtype(values): # If no "dtype" was passed, use the one from "values", but honor # the "ordered" and "categories" arguments dtype = values.dtype._from_categorical_dtype( @@ -459,7 +484,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: _combine_hash_arrays, hash_tuples, ) - from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE + from pandas.core.dtypes.common import is_datetime64tz_dtype, DT64NS_DTYPE if len(categories) and isinstance(categories[0], tuple): # assumes if any individual category is a tuple, then all our. ATM @@ -479,7 +504,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: if is_datetime64tz_dtype(categories.dtype): # Avoid future warning. - categories = categories.astype(_NS_DTYPE) + categories = categories.astype(DT64NS_DTYPE) cat_array = hash_array(np.asarray(categories), categorize=False) if ordered: @@ -492,7 +517,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: return np.bitwise_xor.reduce(hashed) @classmethod - def construct_array_type(cls): + def construct_array_type(cls) -> Type["Categorical"]: """ Return the array type associated with this dtype. @@ -500,7 +525,7 @@ def construct_array_type(cls): ------- type """ - from pandas import Categorical + from pandas import Categorical # noqa: F811 return Categorical @@ -550,7 +575,7 @@ def validate_categories(categories, fastpath: bool = False): if not fastpath: if categories.hasnans: - raise ValueError("Categorial categories cannot be null") + raise ValueError("Categorical categories cannot be null") if not categories.is_unique: raise ValueError("Categorical categories must be unique") @@ -664,9 +689,9 @@ class DatetimeTZDtype(PandasExtensionDtype): _match = re.compile(r"(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]") _cache: Dict[str_type, PandasExtensionDtype] = {} - def __init__(self, unit="ns", tz=None): + def __init__(self, unit: Union[str_type, "DatetimeTZDtype"] = "ns", tz=None): if isinstance(unit, DatetimeTZDtype): - unit, tz = unit.unit, unit.tz + unit, tz = unit.unit, unit.tz # type: ignore if unit != "ns": if isinstance(unit, str) and tz is None: @@ -696,7 +721,7 @@ def __init__(self, unit="ns", tz=None): self._tz = tz @property - def unit(self): + def unit(self) -> str_type: """ The precision of the datetime data. """ @@ -710,7 +735,7 @@ def tz(self): return self._tz @classmethod - def construct_array_type(cls): + def construct_array_type(cls) -> Type["DatetimeArray"]: """ Return the array type associated with this dtype. @@ -718,12 +743,12 @@ def construct_array_type(cls): ------- type """ - from pandas.core.arrays import DatetimeArray + from pandas.core.arrays import DatetimeArray # noqa: F811 return DatetimeArray @classmethod - def construct_from_string(cls, string: str_type): + def construct_from_string(cls, string: str_type) -> "DatetimeTZDtype": """ Construct a DatetimeTZDtype from a string. 
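The `_match` pattern and the typed `construct_from_string` above only admit the `datetime64[ns, <tz>]` / `M8[ns, <tz>]` spellings, so string construction stays strict about the nanosecond unit. A small sketch:

```python
import pandas as pd

# Round-trips through the canonical string form
print(pd.DatetimeTZDtype.construct_from_string("datetime64[ns, UTC]"))
# datetime64[ns, UTC]

# Equality against the string form also holds
print(pd.DatetimeTZDtype("ns", "UTC") == "datetime64[ns, UTC]")  # True
```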
@@ -781,7 +806,7 @@ def __eq__(self, other: Any) -> bool: and str(self.tz) == str(other.tz) ) - def __setstate__(self, state): + def __setstate__(self, state) -> None: # for pickle compat. __get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) @@ -876,7 +901,7 @@ def _parse_dtype_strict(cls, freq): raise ValueError("could not construct PeriodDtype") @classmethod - def construct_from_string(cls, string): + def construct_from_string(cls, string: str_type) -> "PeriodDtype": """ Strict construction from a string, raise a TypeError if not possible @@ -926,7 +951,7 @@ def __setstate__(self, state): self._freq = state["freq"] @classmethod - def is_dtype(cls, dtype) -> bool: + def is_dtype(cls, dtype: object) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) @@ -947,7 +972,7 @@ def is_dtype(cls, dtype) -> bool: return super().is_dtype(dtype) @classmethod - def construct_array_type(cls): + def construct_array_type(cls) -> Type["PeriodArray"]: """ Return the array type associated with this dtype. @@ -959,9 +984,13 @@ def construct_array_type(cls): return PeriodArray - def __from_arrow__(self, array): - """Construct PeriodArray from pyarrow Array/ChunkedArray.""" - import pyarrow + def __from_arrow__( + self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] + ) -> "PeriodArray": + """ + Construct PeriodArray from pyarrow Array/ChunkedArray. + """ + import pyarrow # noqa: F811 from pandas.core.arrays import PeriodArray from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -1040,8 +1069,8 @@ def __new__(cls, subtype=None): try: subtype = pandas_dtype(subtype) - except TypeError: - raise TypeError("could not construct IntervalDtype") + except TypeError as err: + raise TypeError("could not construct IntervalDtype") from err if is_categorical_dtype(subtype) or is_string_dtype(subtype): # GH 19016 @@ -1067,7 +1096,7 @@ def subtype(self): return self._subtype @classmethod - def construct_array_type(cls): + def construct_array_type(cls) -> Type["IntervalArray"]: """ Return the array type associated with this dtype. @@ -1134,7 +1163,7 @@ def __setstate__(self, state): self._subtype = state["subtype"] @classmethod - def is_dtype(cls, dtype) -> bool: + def is_dtype(cls, dtype: object) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) @@ -1152,9 +1181,13 @@ def is_dtype(cls, dtype) -> bool: return False return super().is_dtype(dtype) - def __from_arrow__(self, array): - """Construct IntervalArray from pyarrow Array/ChunkedArray.""" - import pyarrow + def __from_arrow__( + self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] + ) -> "IntervalArray": + """ + Construct IntervalArray from pyarrow Array/ChunkedArray. 
+ """ + import pyarrow # noqa: F811 from pandas.core.arrays import IntervalArray if isinstance(array, pyarrow.Array): diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 435d80b2c4dfb..7f98ca3d402bc 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -63,20 +63,11 @@ def _check(cls, inst) -> bool: "ABCTimedeltaArray", "_typ", ("timedeltaarray") ) ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",)) -ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period",)) ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",)) -ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval",)) ABCExtensionArray = create_pandas_abc_type( "ABCExtensionArray", "_typ", + # Note: IntervalArray and SparseArray are included bc they have _typ="extension" ("extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"), ) ABCPandasArray = create_pandas_abc_type("ABCPandasArray", "_typ", ("npy_extension",)) - - -class _ABCGeneric(type): - def __instancecheck__(cls, inst) -> bool: - return hasattr(inst, "_data") - - -ABCGeneric = _ABCGeneric("ABCGeneric", tuple(), {}) diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 56b880dca1241..d1607b5ede6c3 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -386,3 +386,39 @@ def is_sequence(obj) -> bool: return not isinstance(obj, (str, bytes)) except (TypeError, AttributeError): return False + + +def is_dataclass(item): + """ + Checks if the object is a data-class instance + + Parameters + ---------- + item : object + + Returns + -------- + is_dataclass : bool + True if the item is an instance of a data-class, + will return false if you pass the data class itself + + Examples + -------- + >>> from dataclasses import dataclass + >>> @dataclass + ... class Point: + ... x: int + ... 
y: int + + >>> is_dataclass(Point) + False + >>> is_dataclass(Point(0,2)) + True + + """ + try: + from dataclasses import is_dataclass + + return is_dataclass(item) and not isinstance(item, type) + except ImportError: + return False diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index ee74b02af9516..08a6d42042c1c 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -11,35 +11,29 @@ from pandas._typing import DtypeObj from pandas.core.dtypes.common import ( - _NS_DTYPE, - _TD_DTYPE, + DT64NS_DTYPE, + TD64NS_DTYPE, ensure_object, is_bool_dtype, is_complex_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, - is_period_dtype, is_scalar, is_string_dtype, is_string_like_dtype, - is_timedelta64_dtype, needs_i8_conversion, pandas_dtype, ) from pandas.core.dtypes.generic import ( - ABCDatetimeArray, + ABCDataFrame, ABCExtensionArray, - ABCGeneric, ABCIndexClass, ABCMultiIndex, ABCSeries, - ABCTimedeltaArray, ) from pandas.core.dtypes.inference import is_list_like @@ -139,26 +133,16 @@ def _isna_new(obj): raise NotImplementedError("isna is not defined for MultiIndex") elif isinstance(obj, type): return False - elif isinstance( - obj, - ( - ABCSeries, - np.ndarray, - ABCIndexClass, - ABCExtensionArray, - ABCDatetimeArray, - ABCTimedeltaArray, - ), - ): + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)): return _isna_ndarraylike(obj) - elif isinstance(obj, ABCGeneric): - return obj._constructor(obj._data.isna(func=isna)) + elif isinstance(obj, ABCDataFrame): + return obj.isna() elif isinstance(obj, list): return _isna_ndarraylike(np.asarray(obj, dtype=object)) elif hasattr(obj, "__array__"): return _isna_ndarraylike(np.asarray(obj)) else: - return obj is None + return False def _isna_old(obj): @@ -182,14 +166,14 @@ def _isna_old(obj): return False elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)): return _isna_ndarraylike_old(obj) - elif isinstance(obj, ABCGeneric): - return obj._constructor(obj._data.isna(func=_isna_old)) + elif isinstance(obj, ABCDataFrame): + return obj.isna() elif isinstance(obj, list): return _isna_ndarraylike_old(np.asarray(obj, dtype=object)) elif hasattr(obj, "__array__"): return _isna_ndarraylike_old(np.asarray(obj)) else: - return obj is None + return False _isna = _isna_new @@ -224,37 +208,14 @@ def _use_inf_as_na(key): def _isna_ndarraylike(obj): - is_extension = is_extension_array_dtype(obj) - - if not is_extension: - # Avoid accessing `.values` on things like - # PeriodIndex, which may be expensive. - values = getattr(obj, "values", obj) - else: - values = obj - + is_extension = is_extension_array_dtype(obj.dtype) + values = getattr(obj, "_values", obj) dtype = values.dtype if is_extension: - if isinstance(obj, (ABCIndexClass, ABCSeries)): - values = obj._values - else: - values = obj result = values.isna() - elif isinstance(obj, ABCDatetimeArray): - return obj.isna() elif is_string_dtype(dtype): - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - # object array of strings - result = np.zeros(values.shape, dtype=bool) - else: - # object array of non-strings - result = np.empty(shape, dtype=bool) - vec = libmissing.isnaobj(values.ravel()) - result[...] 
= vec.reshape(shape) + result = _isna_string_dtype(values, dtype, old=False) elif needs_i8_conversion(dtype): # this is the NaT pattern @@ -270,21 +231,13 @@ def _isna_ndarraylike(obj): def _isna_ndarraylike_old(obj): - values = getattr(obj, "values", obj) + values = getattr(obj, "_values", obj) dtype = values.dtype if is_string_dtype(dtype): - # Working around NumPy ticket 1542 - shape = values.shape + result = _isna_string_dtype(values, dtype, old=True) - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = libmissing.isnaobj_old(values.ravel()) - result[:] = vec.reshape(shape) - - elif is_datetime64_dtype(dtype): + elif needs_i8_conversion(dtype): # this is the NaT pattern result = values.view("i8") == iNaT else: @@ -297,6 +250,24 @@ def _isna_ndarraylike_old(obj): return result +def _isna_string_dtype(values: np.ndarray, dtype: np.dtype, old: bool) -> np.ndarray: + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + if old: + vec = libmissing.isnaobj_old(values.ravel()) + else: + vec = libmissing.isnaobj(values.ravel()) + + result[...] = vec.reshape(shape) + + return result + + def notna(obj): """ Detect non-missing values for an array-like object. @@ -511,9 +482,9 @@ def _infer_fill_value(val): elif is_object_dtype(val.dtype): dtype = lib.infer_dtype(ensure_object(val), skipna=False) if dtype in ["datetime", "datetime64"]: - return np.array("NaT", dtype=_NS_DTYPE) + return np.array("NaT", dtype=DT64NS_DTYPE) elif dtype in ["timedelta", "timedelta64"]: - return np.array("NaT", dtype=_TD_DTYPE) + return np.array("NaT", dtype=TD64NS_DTYPE) return np.nan @@ -556,12 +527,7 @@ def na_value_for_dtype(dtype, compat: bool = True): if is_extension_array_dtype(dtype): return dtype.na_value - if ( - is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - or is_timedelta64_dtype(dtype) - or is_period_dtype(dtype) - ): + if needs_i8_conversion(dtype): return NaT elif is_float_dtype(dtype): return np.nan @@ -581,7 +547,7 @@ def remove_na_arraylike(arr): if is_extension_array_dtype(arr): return arr[notna(arr)] else: - return arr[notna(lib.values_from_object(arr))] + return arr[notna(np.asarray(arr))] def is_valid_nat_for_dtype(obj, dtype: DtypeObj) -> bool: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3fc10444ee064..d19f1a263f71a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,6 +23,7 @@ FrozenSet, Hashable, Iterable, + Iterator, List, Optional, Sequence, @@ -40,7 +41,16 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib, properties -from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer +from pandas._typing import ( + ArrayLike, + Axes, + Axis, + Dtype, + FilePathOrBuffer, + Label, + Level, + Renamer, +) from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -77,6 +87,8 @@ ensure_platform_int, infer_dtype_from_object, is_bool_dtype, + is_dataclass, + is_datetime64_any_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -88,16 +100,15 @@ is_list_like, is_named_tuple, is_object_dtype, + is_period_dtype, is_scalar, is_sequence, needs_i8_conversion, ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeIndex, ABCIndexClass, ABCMultiIndex, - ABCPeriodIndex, 
ABCSeries, ) from pandas.core.dtypes.missing import isna, notna @@ -117,6 +128,7 @@ from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( arrays_to_mgr, + dataclasses_to_dicts, get_names_from_index, init_dict, init_ndarray, @@ -243,8 +255,6 @@ dataset. * "many_to_many" or "m:m": allowed, but does not result in checks. - .. versionadded:: 0.21.0 - Returns ------- DataFrame @@ -358,9 +368,9 @@ class DataFrame(NDFrame): -------- DataFrame.from_records : Constructor from tuples, also record arrays. DataFrame.from_dict : From dicts of Series, arrays, or dicts. - read_csv - read_table - read_clipboard + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_table : Read general delimited file into DataFrame. + read_clipboard : Read text from clipboard into DataFrame. Examples -------- @@ -431,9 +441,14 @@ def __init__( dtype = self._validate_dtype(dtype) if isinstance(data, DataFrame): - data = data._data + data = data._mgr if isinstance(data, BlockManager): + if index is None and columns is None and dtype is None and copy is False: + # GH#33357 fastpath + NDFrame.__init__(self, data) + return + mgr = self._init_mgr( data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy ) @@ -474,6 +489,8 @@ def __init__( if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: + if is_dataclass(data[0]): + data = dataclasses_to_dicts(data) if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields @@ -588,10 +605,20 @@ def _is_homogeneous_type(self) -> bool: ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type False """ - if self._data.any_extension_types: - return len({block.dtype for block in self._data.blocks}) == 1 + if self._mgr.any_extension_types: + return len({block.dtype for block in self._mgr.blocks}) == 1 else: - return not self._data.is_mixed_type + return not self._mgr.is_mixed_type + + @property + def _can_fast_transpose(self) -> bool: + """ + Can we transpose this DataFrame without creating any new array objects. + """ + if self._data.any_extension_types: + # TODO(EA2D) special case would be unnecessary with 2D EAs + return False + return len(self._data.blocks) == 1 # ---------------------------------------------------------------------- # Rendering Methods @@ -758,8 +785,8 @@ def to_string( header: Union[bool, Sequence[str]] = True, index: bool = True, na_rep: str = "NaN", - formatters: Optional[fmt.formatters_type] = None, - float_format: Optional[fmt.float_format_type] = None, + formatters: Optional[fmt.FormattersType] = None, + float_format: Optional[fmt.FloatFormatType] = None, sparsify: Optional[bool] = None, index_names: bool = True, justify: Optional[str] = None, @@ -832,11 +859,11 @@ def style(self) -> "Styler": Returns a Styler object. Contains methods for building a styled HTML representation of the DataFrame. - a styled HTML representation fo the DataFrame. See Also -------- - io.formats.style.Styler + io.formats.style.Styler : Helps style a DataFrame or Series according to the + data with HTML and CSS. 
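The `is_dataclass` / `dataclasses_to_dicts` handling added to the `DataFrame` constructor above converts a list of dataclass instances to a list of dicts, so field names become the columns. A minimal sketch, assuming a pandas build with this change:

```python
from dataclasses import dataclass

import pandas as pd


@dataclass
class Point:
    x: int
    y: int


# Each instance becomes a row; the dataclass fields become the columns.
print(pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]))
#    x  y
# 0  0  0
# 1  0  3
# 2  2  3
```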
""" from pandas.io.formats.style import Styler @@ -893,7 +920,7 @@ def style(self) -> "Styler": """ @Appender(_shared_docs["items"]) - def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def items(self) -> Iterable[Tuple[Label, Series]]: if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: yield k, self._get_item_cache(k) @@ -902,10 +929,10 @@ def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield k, self._ixs(i, axis=1) @Appender(_shared_docs["items"]) - def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def iteritems(self) -> Iterable[Tuple[Label, Series]]: yield from self.items() - def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def iterrows(self) -> Iterable[Tuple[Label, Series]]: """ Iterate over DataFrame rows as (index, Series) pairs. @@ -1062,7 +1089,7 @@ def dot(self, other): ------- Series or DataFrame If other is a Series, return the matrix product between self and - other as a Serie. If other is a DataFrame or a numpy.array, return + other as a Series. If other is a DataFrame or a numpy.array, return the matrix product of self and other in a DataFrame of a np.array. See Also @@ -1250,7 +1277,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy=False) -> np.ndarray: + def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -1335,8 +1362,6 @@ def to_dict(self, orient="dict", into=dict): instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. - .. versionadded:: 0.21.0 - Returns ------- dict, list or collections.abc.Mapping @@ -1402,11 +1427,45 @@ def to_dict(self, orient="dict", into=dict): ) # GH16122 into_c = com.standardize_mapping(into) - if orient.lower().startswith("d"): + + orient = orient.lower() + # GH32515 + if orient.startswith(("d", "l", "s", "r", "i")) and orient not in { + "dict", + "list", + "series", + "split", + "records", + "index", + }: + warnings.warn( + "Using short name for 'orient' is deprecated. Only the " + "options: ('dict', list, 'series', 'split', 'records', 'index') " + "will be used in a future version. 
Use one of the above " + "to silence this warning.", + FutureWarning, + ) + + if orient.startswith("d"): + orient = "dict" + elif orient.startswith("l"): + orient = "list" + elif orient.startswith("sp"): + orient = "split" + elif orient.startswith("s"): + orient = "series" + elif orient.startswith("r"): + orient = "records" + elif orient.startswith("i"): + orient = "index" + + if orient == "dict": return into_c((k, v.to_dict(into)) for k, v in self.items()) - elif orient.lower().startswith("l"): + + elif orient == "list": return into_c((k, v.tolist()) for k, v in self.items()) - elif orient.lower().startswith("sp"): + + elif orient == "split": return into_c( ( ("index", self.index.tolist()), @@ -1420,9 +1479,11 @@ def to_dict(self, orient="dict", into=dict): ), ) ) - elif orient.lower().startswith("s"): + + elif orient == "series": return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items()) - elif orient.lower().startswith("r"): + + elif orient == "records": columns = self.columns.tolist() rows = ( dict(zip(columns, row)) @@ -1432,13 +1493,15 @@ def to_dict(self, orient="dict", into=dict): into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items()) for row in rows ] - elif orient.lower().startswith("i"): + + elif orient == "index": if not self.index.is_unique: raise ValueError("DataFrame index must be unique for orient='index'.") return into_c( (t[0], dict(zip(self.columns, t[1:]))) for t in self.itertuples(name=None) ) + else: raise ValueError(f"orient '{orient}' not understood") @@ -1774,7 +1837,9 @@ def to_records( else: ix_vals = [self.index.values] - arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns] + arrays = ix_vals + [ + np.asarray(self.iloc[:, i]) for i in range(len(self.columns)) + ] count = 0 index_names = list(self.index.names) @@ -1789,7 +1854,7 @@ def to_records( names = [str(name) for name in itertools.chain(index_names, self.columns)] else: - arrays = [self[c]._internal_get_values() for c in self.columns] + arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))] names = [str(c) for c in self.columns] index_names = [] @@ -1846,8 +1911,41 @@ def to_records( return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) @classmethod - def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame": - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + def _from_arrays( + cls, arrays, columns, index, dtype=None, verify_integrity=True + ) -> "DataFrame": + """ + Create DataFrame from a list of arrays corresponding to the columns. + + Parameters + ---------- + arrays : list-like of arrays + Each array in the list corresponds to one column, in order. + columns : list-like, Index + The column names for the resulting DataFrame. + index : list-like, Index + The rows labels for the resulting DataFrame. + dtype : dtype, optional + Optional dtype to enforce for all arrays. + verify_integrity : bool, default True + Validate and homogenize all input. If set to False, it is assumed + that all elements of `arrays` are actual arrays how they will be + stored in a block (numpy ndarray or ExtensionArray), have the same + length as and are aligned with the index, and that `columns` and + `index` are ensured to be an Index object. 
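The `to_dict` changes above normalize `orient` to its full spelling and emit a `FutureWarning` for abbreviated names. A small sketch of the resulting behaviour, assuming a pandas build that includes this deprecation; the frame contents are illustrative:

```python
import warnings

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# Full orient names are the supported spellings going forward.
print(df.to_dict(orient="records"))
# [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]

# Abbreviated spellings such as "r" still resolve to "records",
# but now raise a FutureWarning per the branch shown above.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df.to_dict(orient="r")
    print(caught[0].category)  # <class 'FutureWarning'>
```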
+ + Returns + ------- + DataFrame + """ + mgr = arrays_to_mgr( + arrays, + columns, + index, + columns, + dtype=dtype, + verify_integrity=verify_integrity, + ) return cls(mgr) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") @@ -1985,18 +2083,24 @@ def to_stata( writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path) -> None: + def to_feather(self, path, **kwargs) -> None: """ - Write out the binary feather-format for DataFrames. + Write a DataFrame to the binary Feather format. Parameters ---------- path : str String file path. + **kwargs : + Additional keywords passed to :func:`pyarrow.feather.write_feather`. + Starting with pyarrow 0.17, this includes the `compression`, + `compression_level`, `chunksize` and `version` keywords. + + .. versionadded:: 1.1.0 """ from pandas.io.feather_format import to_feather - to_feather(self, path) + to_feather(self, path, **kwargs) @Appender( """ @@ -2041,8 +2145,6 @@ def to_parquet( """ Write a DataFrame to the binary parquet format. - .. versionadded:: 0.21.0 - This function writes the dataframe as a `parquet file `_. You can choose different parquet backends, and have the option of compression. See @@ -2224,7 +2326,7 @@ def to_html( ) # ---------------------------------------------------------------------- - @Appender(info.__doc__) + @doc(info) def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None ) -> None: @@ -2444,9 +2546,11 @@ def transpose(self, *args, copy: bool = False) -> "DataFrame": new_values, index=self.columns, columns=self.index ) - return result.__finalize__(self) + return result.__finalize__(self, method="transpose") - T = property(transpose) + @property + def T(self) -> "DataFrame": + return self.transpose() # ---------------------------------------------------------------------- # Indexing Methods @@ -2464,7 +2568,7 @@ def _ixs(self, i: int, axis: int = 0): """ # irow if axis == 0: - new_values = self._data.fast_xs(i) + new_values = self._mgr.fast_xs(i) # if we are a copy, mark as such copy = isinstance(new_values, np.ndarray) and new_values.base is None @@ -2481,7 +2585,7 @@ def _ixs(self, i: int, axis: int = 0): else: label = self.columns[i] - values = self._data.iget(i) + values = self._mgr.iget(i) result = self._box_col_values(values, label) # this is a cached value, mark it so @@ -2489,6 +2593,21 @@ def _ixs(self, i: int, axis: int = 0): return result + def _get_column_array(self, i: int) -> ArrayLike: + """ + Get the values of the i'th column (ndarray or ExtensionArray, as stored + in the Block) + """ + return self._data.iget_values(i) + + def _iter_column_arrays(self) -> Iterator[ArrayLike]: + """ + Iterate over the arrays of all columns in order. + This returns the values as stored in the Block (ndarray or ExtensionArray). 
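The `to_feather` change above forwards `**kwargs` to `pyarrow.feather.write_feather`, which accepts `compression`, `compression_level`, `chunksize` and `version` starting with pyarrow 0.17. A hedged sketch of how that could be used; the file path and compression settings are illustrative, not prescribed by the diff:

```python
import pandas as pd

df = pd.DataFrame({"a": range(5), "b": list("abcde")})

# With pyarrow >= 0.17 these keywords are passed straight through to
# pyarrow.feather.write_feather via **kwargs.
df.to_feather("frame.feather", compression="zstd", compression_level=3)

round_tripped = pd.read_feather("frame.feather")
assert round_tripped.equals(df)
```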
+ """ + for i in range(len(self.columns)): + yield self._get_column_array(i) + def __getitem__(self, key): key = lib.item_from_zerodim(key) key = com.apply_if_callable(key, self) @@ -2686,6 +2805,7 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: + self.loc._ensure_listlike_indexer(key, axis=1) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] @@ -2709,6 +2829,20 @@ def _setitem_frame(self, key, value): self._check_setitem_copy() self._where(-key, value, inplace=True) + def _iset_item(self, loc: int, value): + self._ensure_valid_index(value) + + # technically _sanitize_column expects a label, not a position, + # but the behavior is the same as long as we pass broadcast=False + value = self._sanitize_column(loc, value, broadcast=False) + NDFrame._iset_item(self, loc, value) + + # check if we are modifying a copy + # try to set first as we want an invalid + # value exception to occur first + if len(self): + self._check_setitem_copy() + def _set_item(self, key, value): """ Add series to DataFrame in specified column. @@ -2742,7 +2876,7 @@ def _set_value(self, index, col, value, takeable: bool = False): """ try: if takeable is True: - series = self._iget_item_cache(col) + series = self._ixs(col, axis=1) series._set_value(index, value, takeable=True) return @@ -2771,13 +2905,13 @@ def _ensure_valid_index(self, value): if not len(self.index) and is_list_like(value) and len(value): try: value = Series(value) - except (ValueError, NotImplementedError, TypeError): + except (ValueError, NotImplementedError, TypeError) as err: raise ValueError( "Cannot set a frame with no defined index " "and a value that cannot be converted to a Series" - ) + ) from err - self._data = self._data.reindex_axis( + self._mgr = self._mgr.reindex_axis( value.index.copy(), axis=1, fill_value=np.nan ) @@ -2949,16 +3083,16 @@ def query(self, expr, inplace=False, **kwargs): res = self.eval(expr, **kwargs) try: - new_data = self.loc[res] + result = self.loc[res] except ValueError: # when res is multi-dimensional loc raises, but this is sometimes a # valid query - new_data = self[res] + result = self[res] if inplace: - self._update_inplace(new_data) + self._update_inplace(result) else: - return new_data + return result def eval(self, expr, inplace=False, **kwargs): """ @@ -3225,7 +3359,7 @@ def insert(self, loc, column, value, allow_duplicates=False) -> None: """ self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) - self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) + self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates) def assign(self, **kwargs) -> "DataFrame": r""" @@ -3338,7 +3472,7 @@ def reindexer(value): # other raise TypeError( "incompatible index of inserted column with frame index" - ) + ) from err return value if isinstance(value, Series): @@ -3406,7 +3540,7 @@ def reindexer(value): @property def _series(self): return { - item: Series(self._data.iget(idx), index=self.index, name=item) + item: Series(self._mgr.iget(idx), index=self.index, name=item) for idx, item in enumerate(self.columns) } @@ -3432,6 +3566,9 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: n = len(row_labels) if n != len(col_labels): raise ValueError("Row labels must have same size as column labels") + if not (self.index.is_unique and self.columns.is_unique): + # GH#33041 + raise ValueError("DataFrame.lookup requires unique index and columns") thresh = 1000 if not 
self._is_mixed_type or n > thresh: @@ -3534,7 +3671,7 @@ def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": fill_value=fill_value, ) - @Appender(_shared_docs["align"] % _shared_doc_kwargs) + @doc(NDFrame.align, **_shared_doc_kwargs) def align( self, other, @@ -3600,7 +3737,7 @@ def align( see_also_sub=" or columns", ) @Appender(NDFrame.set_axis.__doc__) - def set_axis(self, labels, axis=0, inplace=False): + def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): return super().set_axis(labels, axis=axis, inplace=inplace) @Substitution(**_shared_doc_kwargs) @@ -3622,7 +3759,7 @@ def reindex(self, *args, **kwargs) -> "DataFrame": # Pop these, since the values are in `kwargs` under different names kwargs.pop("axis", None) kwargs.pop("labels", None) - return self._ensure_type(super().reindex(**kwargs)) + return super().reindex(**kwargs) def drop( self, @@ -3652,13 +3789,9 @@ def drop( index : single label or list-like Alternative to specifying axis (``labels, axis=0`` is equivalent to ``index=labels``). - - .. versionadded:: 0.21.0 columns : single label or list-like Alternative to specifying axis (``labels, axis=1`` is equivalent to ``columns=labels``). - - .. versionadded:: 0.21.0 level : int or level name, optional For MultiIndex, level from which the labels will be removed. inplace : bool, default False @@ -3804,7 +3937,7 @@ def rename( columns : dict-like or function Alternative to specifying axis (``mapper, axis=1`` is equivalent to ``columns=mapper``). - axis : int or str + axis : {0 or 'index', 1 or 'columns'}, default 0 Axis to target with ``mapper``. Can be either the axis name ('index', 'columns') or number (0, 1). The default is 'index'. copy : bool, default True @@ -3919,7 +4052,7 @@ def fillna( downcast=downcast, ) - @Appender(_shared_docs["replace"] % _shared_doc_kwargs) + @doc(NDFrame.replace, **_shared_doc_kwargs) def replace( self, to_replace=None, @@ -3938,10 +4071,45 @@ def replace( method=method, ) - @Appender(_shared_docs["shift"] % _shared_doc_kwargs) + def _replace_columnwise( + self, mapping: Dict[Label, Tuple[Any, Any]], inplace: bool, regex + ): + """ + Dispatch to Series.replace column-wise. + + + Parameters + ---------- + mapping : dict + of the form {col: (target, value)} + inplace : bool + regex : bool or same types as `to_replace` in DataFrame.replace + + Returns + ------- + DataFrame or None + """ + # Operate column-wise + res = self if inplace else self.copy() + ax = self.columns + + for i in range(len(ax)): + if ax[i] in mapping: + ser = self.iloc[:, i] + + target, value = mapping[ax[i]] + newobj = ser.replace(target, value, regex=regex) + + res.iloc[:, i] = newobj + + if inplace: + return + return res.__finalize__(self) + + @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": - return self._ensure_type( - super().shift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) + return super().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value ) def set_index( @@ -4046,7 +4214,7 @@ def set_index( "one-dimensional arrays." ) - missing: List[Optional[Hashable]] = [] + missing: List[Label] = [] for col in keys: if isinstance( col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator) @@ -4059,8 +4227,10 @@ def set_index( # everything else gets tried as a key; see GH 24969 try: found = col in self.columns - except TypeError: - raise TypeError(f"{err_msg}. 
Received column of type {type(col)}") + except TypeError as err: + raise TypeError( + f"{err_msg}. Received column of type {type(col)}" + ) from err else: if not found: missing.append(col) @@ -4083,7 +4253,7 @@ def set_index( else: arrays.append(self.index) - to_remove: List[Optional[Hashable]] = [] + to_remove: List[Label] = [] for col in keys: if isinstance(col, ABCMultiIndex): for n in range(col.nlevels): @@ -4138,7 +4308,7 @@ def reset_index( drop: bool = False, inplace: bool = False, col_level: Hashable = 0, - col_fill: Optional[Hashable] = "", + col_fill: Label = "", ) -> Optional["DataFrame"]: """ Reset the index, or a level of it. @@ -4314,7 +4484,7 @@ def _maybe_casted_values(index, labels=None): values_dtype = values.dtype if issubclass(values_type, DatetimeLikeArray): - values = values._data + values = values._data # TODO: can we de-kludge yet? if mask.any(): values, _ = maybe_upcast_putmask(values, mask, np.nan) @@ -4380,19 +4550,20 @@ def _maybe_casted_values(index, labels=None): @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isna(self) -> "DataFrame": - return super().isna() + result = self._constructor(self._data.isna(func=isna)) + return result.__finalize__(self, method="isna") @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isnull(self) -> "DataFrame": - return super().isnull() + return self.isna() @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notna(self) -> "DataFrame": - return super().notna() + return ~self.isna() @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notnull(self) -> "DataFrame": - return super().notnull() + return ~self.isna() def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): """ @@ -4573,6 +4744,47 @@ def drop_duplicates( See Also -------- DataFrame.value_counts: Count unique combinations of columns. + + Examples + -------- + Consider dataset containing ramen rating. + + >>> df = pd.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, it removes duplicate rows based on all columns. + + >>> df.drop_duplicates() + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + To remove duplicates on specific column(s), use ``subset``. + + >>> df.drop_duplicates(subset=['brand']) + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + + To remove duplicates and keep last occurences, use ``keep``. 
+ + >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') + brand style rating + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 4 Indomie pack 5.0 """ if self.empty: return self.copy() @@ -4580,22 +4792,16 @@ def drop_duplicates( inplace = validate_bool_kwarg(inplace, "inplace") duplicated = self.duplicated(subset, keep=keep) - if inplace: - (inds,) = (-duplicated)._ndarray_values.nonzero() - new_data = self._data.take(inds) + result = self[-duplicated] + if ignore_index: + result.index = ibase.default_index(len(result)) - if ignore_index: - new_data.axes[1] = ibase.default_index(len(inds)) - self._update_inplace(new_data) + if inplace: + self._update_inplace(result) + return None else: - result = self[-duplicated] - - if ignore_index: - result.index = ibase.default_index(len(result)) return result - return None - def duplicated( self, subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, @@ -4621,6 +4827,73 @@ def duplicated( Returns ------- Series + Boolean series for each duplicated rows. + + See Also + -------- + Index.duplicated : Equivalent method on index. + Series.duplicated : Equivalent method on Series. + Series.drop_duplicates : Remove duplicate values from Series. + DataFrame.drop_duplicates : Remove duplicate values from DataFrame. + + Examples + -------- + Consider dataset containing ramen rating. + + >>> df = pd.DataFrame({ + ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, for each set of duplicated values, the first occurrence + is set on False and all others on True. + + >>> df.duplicated() + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True. + + >>> df.duplicated(keep='last') + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool + + By setting ``keep`` on False, all duplicates are True. + + >>> df.duplicated(keep=False) + 0 True + 1 True + 2 False + 3 False + 4 False + dtype: bool + + To find duplicates on specific column(s), use ``subset``. + + >>> df.duplicated(subset=['brand']) + 0 False + 1 True + 2 False + 3 True + 4 True + dtype: bool """ from pandas.core.sorting import get_group_index from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT @@ -4703,17 +4976,18 @@ def sort_values( k, kind=kind, ascending=ascending, na_position=na_position ) - new_data = self._data.take( + new_data = self._mgr.take( indexer, axis=self._get_block_manager_axis(axis), verify=False ) if ignore_index: new_data.axes[1] = ibase.default_index(len(indexer)) + result = self._constructor(new_data) if inplace: - return self._update_inplace(new_data) + return self._update_inplace(result) else: - return self._constructor(new_data).__finalize__(self) + return result.__finalize__(self, method="sort_values") def sort_index( self, @@ -4729,6 +5003,9 @@ def sort_index( """ Sort object by labels (along an axis). + Returns a new DataFrame sorted by label if `inplace` argument is + ``False``, otherwise updates the original DataFrame and returns None. + Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 @@ -4759,8 +5036,37 @@ def sort_index( Returns ------- - sorted_obj : DataFrame or None - DataFrame with sorted index if inplace=False, None otherwise. 
+ DataFrame + The original DataFrame sorted by the labels. + + See Also + -------- + Series.sort_index : Sort Series by the index. + DataFrame.sort_values : Sort DataFrame by the value. + Series.sort_values : Sort Series by the value. + + Examples + -------- + >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], + ... columns=['A']) + >>> df.sort_index() + A + 1 4 + 29 2 + 100 1 + 150 5 + 234 3 + + By default, it sorts in ascending order, to sort in descending order, + use ``ascending=False`` + + >>> df.sort_index(ascending=False) + A + 234 3 + 150 5 + 100 1 + 29 2 + 1 4 """ # TODO: this can be combined with Series.sort_index impl as # almost identical @@ -4805,7 +5111,7 @@ def sort_index( ) baxis = self._get_block_manager_axis(axis) - new_data = self._data.take(indexer, axis=baxis, verify=False) + new_data = self._mgr.take(indexer, axis=baxis, verify=False) # reconstruct axis if needed new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() @@ -4813,10 +5119,11 @@ def sort_index( if ignore_index: new_data.axes[1] = ibase.default_index(len(indexer)) + result = self._constructor(new_data) if inplace: - return self._update_inplace(new_data) + return self._update_inplace(result) else: - return self._constructor(new_data).__finalize__(self) + return result.__finalize__(self, method="sort_index") def value_counts( self, @@ -5133,6 +5440,9 @@ def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": ---------- i, j : int or str Levels of the indices to be swapped. Can pass level name as string. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to swap levels on. 0 or 'index' for row-wise, 1 or + 'columns' for column-wise. Returns ------- @@ -5162,7 +5472,7 @@ def reorder_levels(self, order, axis=0) -> "DataFrame": order : list of int or list of str List representing new level order. Reference level by number (position) or by key (label). - axis : int + axis : {0 or 'index', 1 or 'columns'}, default 0 Where to reorder levels. Returns @@ -5213,20 +5523,6 @@ def _arith_op(left, right): return new_data - def _combine_match_index(self, other: Series, func): - # at this point we have `self.index.equals(other.index)` - - if ops.should_series_dispatch(self, other, func): - # operate column-wise; avoid costly object-casting in `.values` - new_data = ops.dispatch_to_series(self, other, func) - else: - # fastpath --> operate directly on values - other_vals = other.values.reshape(-1, 1) - with np.errstate(all="ignore"): - new_data = func(self.values, other_vals) - new_data = dispatch_fill_zeros(func, self.values, other_vals, new_data) - return new_data - def _construct_result(self, result) -> "DataFrame": """ Wrap the result of an arithmetic, comparison, or logical operation. @@ -5900,7 +6196,8 @@ def pivot(self, index=None, columns=None, values=None) -> "DataFrame": If dict is passed, the key is column to aggregate and value is function or list of functions. fill_value : scalar, default None - Value to replace missing values with. + Value to replace missing values with (in the resulting pivot table, + after aggregation). margins : bool, default False Add all row / columns (e.g. for subtotal / grand totals). dropna : bool, default True @@ -6367,10 +6664,12 @@ def unstack(self, level=-1, fill_value=None): See Also -------- - %(other)s - pivot_table - DataFrame.pivot - Series.explode + %(other)s : Identical method. + pivot_table : Create a spreadsheet-style pivot table as a DataFrame. 
+ DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. + DataFrame.explode : Explode a DataFrame from list-like + columns to long format. Examples -------- @@ -6460,7 +6759,7 @@ def melt( # ---------------------------------------------------------------------- # Time series-related - def diff(self, periods=1, axis=0) -> "DataFrame": + def diff(self, periods: int = 1, axis: Axis = 0) -> "DataFrame": """ First discrete difference of element. @@ -6551,7 +6850,12 @@ def diff(self, periods=1, axis=0) -> "DataFrame": 5 NaN NaN NaN """ bm_axis = self._get_block_manager_axis(axis) - new_data = self._data.diff(n=periods, axis=bm_axis) + self._consolidate_inplace() + + if bm_axis == 0 and periods != 0: + return self.T.diff(periods, axis=0).T + + new_data = self._mgr.diff(n=periods, axis=bm_axis) return self._constructor(new_data) # ---------------------------------------------------------------------- @@ -6784,7 +7088,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): 2 [1, 2] dtype: object - Passing result_type='expand' will expand list-like results + Passing ``result_type='expand'`` will expand list-like results to columns of a Dataframe >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') @@ -7392,8 +7696,9 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame": See Also -------- - DataFrame.corrwith - Series.corr + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + Series.corr : Compute the correlation between two Series. Examples -------- @@ -7595,7 +7900,7 @@ def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series: See Also -------- - DataFrame.corr + DataFrame.corr : Compute pairwise correlation of columns. """ axis = self._get_axis_number(axis) this = self._get_numeric_data() @@ -7666,7 +7971,7 @@ def count(self, axis=0, level=None, numeric_only=False): ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index' counts are generated for each column. - If 1 or 'columns' counts are generated for each **row**. + If 1 or 'columns' counts are generated for each row. level : int or str, optional If the axis is a `MultiIndex` (hierarchical), count along a particular `level`, collapsing into a `DataFrame`. @@ -7744,7 +8049,7 @@ def count(self, axis=0, level=None, numeric_only=False): if len(frame._get_axis(axis)) == 0: result = Series(0, index=frame._get_agg_axis(axis)) else: - if frame._is_mixed_type or frame._data.any_extension_types: + if frame._is_mixed_type or frame._mgr.any_extension_types: # the or any_extension_types is really only hit for single- # column frames with an extension array result = notna(frame).sum(axis=axis) @@ -7770,18 +8075,21 @@ def _count_level(self, level, axis=0, numeric_only=False): f"Can only count levels on hierarchical {self._get_axis_name(axis)}." 
) + # Mask NaNs: Mask rows or columns where the index level is NaN, and all + # values in the DataFrame that are NaN if frame._is_mixed_type: # Since we have mixed types, calling notna(frame.values) might # upcast everything to object - mask = notna(frame).values + values_mask = notna(frame).values else: # But use the speedup when we have homogeneous dtypes - mask = notna(frame.values) + values_mask = notna(frame.values) + index_mask = notna(count_axis.get_level_values(level=level)) if axis == 1: - # We're transposing the mask rather than frame to avoid potential - # upcasts to object, which induces a ~20x slowdown - mask = mask.T + mask = index_mask & values_mask + else: + mask = index_mask.reshape(-1, 1) & values_mask if isinstance(level, str): level = count_axis._get_level_number(level) @@ -7789,25 +8097,32 @@ def _count_level(self, level, axis=0, numeric_only=False): level_name = count_axis._names[level] level_index = count_axis.levels[level]._shallow_copy(name=level_name) level_codes = ensure_int64(count_axis.codes[level]) - counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0) - - result = DataFrame(counts, index=level_index, columns=agg_axis) + counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) if axis == 1: - # Undo our earlier transpose - return result.T + result = DataFrame(counts, index=agg_axis, columns=level_index) else: - return result + result = DataFrame(counts, index=level_index, columns=agg_axis) + + return result def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds ): - dtype_is_dt = self.dtypes.apply(lambda x: x.kind == "M") + assert filter_type is None or filter_type == "bool", filter_type + + dtype_is_dt = np.array( + [ + is_datetime64_any_dtype(values.dtype) or is_period_dtype(values.dtype) + for values in self._iter_column_arrays() + ], + dtype=bool, + ) if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): warnings.warn( "DataFrame.mean and DataFrame.median with numeric_only=None " - "will include datetime64 and datetime64tz columns in a " + "will include datetime64, datetime64tz, and PeriodDtype columns in a " "future version.", FutureWarning, stacklevel=3, @@ -7828,7 +8143,7 @@ def f(x): return op(x, axis=axis, skipna=skipna, **kwds) def _get_data(axis_matters): - if filter_type is None or filter_type == "numeric": + if filter_type is None: data = self._get_numeric_data() elif filter_type == "bool": if axis_matters: @@ -7862,73 +8177,89 @@ def blk_func(values): # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager._reduce - res = df._data.reduce(blk_func) + res = df._mgr.reduce(blk_func) assert isinstance(res, dict) if len(res): assert len(res) == max(list(res.keys())) + 1, res.keys() out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) out.index = df.columns + if axis == 0 and df.dtypes.apply(needs_i8_conversion).any(): + # FIXME: needs_i8_conversion check is kludge, not sure + # why it is necessary in this case and this case alone + out[:] = coerce_to_dtypes(out.values, df.dtypes) return out + if not self._is_homogeneous_type: + # try to avoid self.values call + + if filter_type is None and axis == 0 and len(self) > 0: + # operate column-wise + + # numeric_only must be None here, as other cases caught above + # require len(self) > 0 bc frame_apply messes up empty prod/sum + + # this can end up with a non-reduction + # but not always. 
if the types are mixed + # with datelike then need to make sure a series + + # we only end up here if we have not specified + # numeric_only and yet we have tried a + # column-by-column reduction, where we have mixed type. + # So let's just do what we can + from pandas.core.apply import frame_apply + + opa = frame_apply( + self, func=f, result_type="expand", ignore_failures=True + ) + result = opa.get_result() + if result.ndim == self.ndim: + result = result.iloc[0].rename(None) + return result + + data = self if numeric_only is None: - values = self.values + data = self + values = data.values + try: result = f(values) - if filter_type == "bool" and is_object_dtype(values) and axis is None: - # work around https://github.com/numpy/numpy/issues/10489 - # TODO: combine with hasattr(result, 'dtype') further down - # hard since we don't have `values` down there. - result = np.bool_(result) except TypeError: # e.g. in nanops trying to convert strs to float - # try by-column first - if filter_type is None and axis == 0: - # this can end up with a non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series - - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. - # So let's just do what we can - from pandas.core.apply import frame_apply - - opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0] - return result - # TODO: why doesnt axis matter here? data = _get_data(axis_matters=False) - with np.errstate(all="ignore"): - result = f(data.values) labels = data._get_agg_axis(axis) + + values = data.values + with np.errstate(all="ignore"): + result = f(values) + else: if numeric_only: data = _get_data(axis_matters=True) + labels = data._get_agg_axis(axis) values = data.values - labels = data._get_agg_axis(axis) else: - values = self.values + data = self + values = data.values result = f(values) - if hasattr(result, "dtype") and is_object_dtype(result.dtype): + if filter_type == "bool" and is_object_dtype(values) and axis is None: + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: can we de-duplicate parts of this with the next blocK? + result = np.bool_(result) + elif hasattr(result, "dtype") and is_object_dtype(result.dtype): try: - if filter_type is None or filter_type == "numeric": + if filter_type is None: result = result.astype(np.float64) elif filter_type == "bool" and notna(result).all(): result = result.astype(np.bool_) except (ValueError, TypeError): - # try to coerce to the original dtypes item by item if we can if axis == 0: - result = coerce_to_dtypes(result, self.dtypes) + result = coerce_to_dtypes(result, data.dtypes) if constructor is not None: result = self._constructor_sliced(result, index=labels) @@ -8000,11 +8331,40 @@ def idxmin(self, axis=0, skipna=True) -> Series: See Also -------- - Series.idxmin + Series.idxmin : Return index of the minimum element. Notes ----- This method is the DataFrame version of ``ndarray.argmin``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}, + ... 
index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the minimum value in each column. + + >>> df.idxmin() + consumption Pork + co2_emissions Wheat Products + dtype: object + + To return the index for the minimum value in each row, use ``axis="columns"``. + + >>> df.idxmin(axis="columns") + Pork consumption + Wheat Products co2_emissions + Beef consumption + dtype: object """ axis = self._get_axis_number(axis) indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) @@ -8038,11 +8398,40 @@ def idxmax(self, axis=0, skipna=True) -> Series: See Also -------- - Series.idxmax + Series.idxmax : Return index of the maximum element. Notes ----- This method is the DataFrame version of ``ndarray.argmax``. + + Examples + -------- + Consider a dataset containing food consumption in Argentina. + + >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], + ... 'co2_emissions': [37.2, 19.66, 1712]}, + ... index=['Pork', 'Wheat Products', 'Beef']) + + >>> df + consumption co2_emissions + Pork 10.51 37.20 + Wheat Products 103.11 19.66 + Beef 55.48 1712.00 + + By default, it returns the index for the maximum value in each column. + + >>> df.idxmax() + consumption Wheat Products + co2_emissions Beef + dtype: object + + To return the index for the maximum value in each row, use ``axis="columns"``. + + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object """ axis = self._get_axis_number(axis) indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) @@ -8156,7 +8545,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): ---------- q : float or array-like, default 0.5 (50% quantile) Value between 0 <= q <= 1, the quantile(s) to compute. - axis : {0, 1, 'index', 'columns'} (default 0) + axis : {0, 1, 'index', 'columns'}, default 0 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. numeric_only : bool, default True If False, the quantile of datetime and timedelta data will be @@ -8230,7 +8619,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): return self._constructor([], index=q, columns=cols) return self._constructor_sliced([], index=cols, name=q, dtype=np.float64) - result = data._data.quantile( + result = data._mgr.quantile( qs=q, axis=1, interpolation=interpolation, transposed=is_transposed ) @@ -8244,7 +8633,9 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): return result - def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame": + def to_timestamp( + self, freq=None, how: str = "start", axis: Axis = 0, copy: bool = True + ) -> "DataFrame": """ Cast to DatetimeIndex of timestamps, at *beginning* of period. @@ -8264,23 +8655,16 @@ def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame" ------- DataFrame with DatetimeIndex """ - new_data = self._data - if copy: - new_data = new_data.copy() + new_obj = self.copy(deep=copy) - axis = self._get_axis_number(axis) - if axis == 0: - assert isinstance(self.index, (ABCDatetimeIndex, ABCPeriodIndex)) - new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how)) - elif axis == 1: - assert isinstance(self.columns, (ABCDatetimeIndex, ABCPeriodIndex)) - new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how)) - else: # pragma: no cover - raise AssertionError(f"Axis must be 0 or 1. 
Got {axis}") + axis_name = self._get_axis_name(axis) + old_ax = getattr(self, axis_name) + new_ax = old_ax.to_timestamp(freq=freq, how=how) - return self._constructor(new_data) + setattr(new_obj, axis_name, new_ax) + return new_obj - def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame": + def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> "DataFrame": """ Convert DataFrame from DatetimeIndex to PeriodIndex. @@ -8298,23 +8682,16 @@ def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame": Returns ------- - TimeSeries with PeriodIndex + DataFrame with PeriodIndex """ - new_data = self._data - if copy: - new_data = new_data.copy() + new_obj = self.copy(deep=copy) - axis = self._get_axis_number(axis) - if axis == 0: - assert isinstance(self.index, ABCDatetimeIndex) - new_data.set_axis(1, self.index.to_period(freq=freq)) - elif axis == 1: - assert isinstance(self.columns, ABCDatetimeIndex) - new_data.set_axis(0, self.columns.to_period(freq=freq)) - else: # pragma: no cover - raise AssertionError(f"Axis must be 0 or 1. Got {axis}") + axis_name = self._get_axis_name(axis) + old_ax = getattr(self, axis_name) + new_ax = old_ax.to_period(freq=freq) - return self._constructor(new_data) + setattr(new_obj, axis_name, new_ax) + return new_obj def isin(self, values) -> "DataFrame": """ @@ -8382,14 +8759,12 @@ def isin(self, values) -> "DataFrame": from pandas.core.reshape.concat import concat values = collections.defaultdict(list, values) - return self._ensure_type( - concat( - ( - self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns) - ), - axis=1, - ) + return concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, ) elif isinstance(values, Series): if not values.index.is_unique: @@ -8446,9 +8821,8 @@ def isin(self, values) -> "DataFrame": def _from_nested_dict(data): # TODO: this should be seriously cythonized - new_data = {} + new_data = collections.defaultdict(dict) for index, s in data.items(): for col, v in s.items(): - new_data[col] = new_data.get(col, {}) new_data[col][index] = v return new_data diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ff7c481d550d4..6a4f83427310e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -30,7 +30,7 @@ from pandas._config import config -from pandas._libs import Timestamp, iNaT, lib +from pandas._libs import Timestamp, lib from pandas._typing import ( Axis, FilePathOrBuffer, @@ -72,7 +72,6 @@ is_number, is_numeric_dtype, is_object_dtype, - is_period_arraylike, is_re_compilable, is_scalar, is_timedelta64_dtype, @@ -148,7 +147,7 @@ def _single_replace(self, to_replace, method, inplace, limit): result = pd.Series(values, index=self.index, dtype=self.dtype).__finalize__(self) if inplace: - self._update_inplace(result._data) + self._update_inplace(result) return return result @@ -170,7 +169,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ _internal_names: List[str] = [ - "_data", + "_mgr", "_cacher", "_item_cache", "_cache", @@ -189,7 +188,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): _deprecations: FrozenSet[str] = frozenset(["get_values"]) _metadata: List[str] = [] _is_copy = None - _data: BlockManager + _mgr: BlockManager _attrs: Dict[Optional[Hashable], Any] _typ: str @@ -205,7 +204,7 @@ def __init__( # copy kwarg is retained for mypy compat, is not used object.__setattr__(self, "_is_copy", None) - object.__setattr__(self, "_data", data) + object.__setattr__(self, 
"_mgr", data) object.__setattr__(self, "_item_cache", {}) if attrs is None: attrs = {} @@ -214,13 +213,13 @@ def __init__( object.__setattr__(self, "_attrs", attrs) @classmethod - def _init_mgr(cls, mgr, axes=None, dtype=None, copy=False): + def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: """ passed a manager and a axes dict """ for a, axe in axes.items(): if axe is not None: - mgr = mgr.reindex_axis( - axe, axis=cls._get_block_manager_axis(a), copy=False - ) + axe = ensure_index(axe) + bm_axis = cls._get_block_manager_axis(a) + mgr = mgr.reindex_axis(axe, axis=bm_axis, copy=False) # make a copy if explicitly requested if copy: @@ -292,6 +291,15 @@ def _constructor_expanddim(self): """ raise NotImplementedError + # ---------------------------------------------------------------------- + # Internals + + @property + def _data(self): + # GH#33054 retained because some downstream packages uses this, + # e.g. fastparquet + return self._mgr + # ---------------------------------------------------------------------- # Axis _AXIS_ALIASES = {"rows": 0} @@ -335,9 +343,11 @@ def _construct_axes_from_arguments( if a not in kwargs: try: kwargs[a] = args.pop(0) - except IndexError: + except IndexError as err: if require_all: - raise TypeError("not enough/duplicate arguments specified!") + raise TypeError( + "not enough/duplicate arguments specified!" + ) from err axes = {a: kwargs.pop(a, sentinel) for a in cls._AXIS_ORDERS} return axes, kwargs @@ -353,7 +363,7 @@ def _get_axis_number(cls, axis): return cls._AXIS_NUMBERS[axis] except KeyError: pass - raise ValueError(f"No axis named {axis} for object type {cls}") + raise ValueError(f"No axis named {axis} for object type {cls.__name__}") @classmethod def _get_axis_name(cls, axis): @@ -366,7 +376,7 @@ def _get_axis_name(cls, axis): return cls._AXIS_NAMES[axis] except KeyError: pass - raise ValueError(f"No axis named {axis} for object type {cls}") + raise ValueError(f"No axis named {axis} for object type {cls.__name__}") def _get_axis(self, axis): name = self._get_axis_name(axis) @@ -482,7 +492,7 @@ def ndim(self) -> int: >>> df.ndim 2 """ - return self._data.ndim + return self._mgr.ndim @property def size(self) -> int: @@ -518,20 +528,13 @@ def _obj_with_exclusions(self: FrameOrSeries) -> FrameOrSeries: """ internal compat with SelectionMixin """ return self - def set_axis(self, labels, axis=0, inplace=False): + def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): """ Assign desired index to given axis. Indexes for%(extended_summary_sub)s row labels can be changed by assigning a list-like or Index. - .. versionchanged:: 0.21.0 - - The signature is now `labels` and `axis`, consistent with - the rest of pandas API. Previously, the `axis` and `labels` - arguments were respectively the first and second positional - arguments. 
- Parameters ---------- labels : list-like, Index @@ -559,8 +562,9 @@ def set_axis(self, labels, axis=0, inplace=False): obj.set_axis(labels, axis=axis, inplace=True) return obj - def _set_axis(self, axis, labels) -> None: - self._data.set_axis(axis, labels) + def _set_axis(self, axis: int, labels: Index) -> None: + labels = ensure_index(labels) + self._mgr.set_axis(axis, labels) self._clear_item_cache() def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: @@ -586,7 +590,9 @@ def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: if copy: new_values = new_values.copy() - return self._constructor(new_values, *new_axes).__finalize__(self) + return self._constructor(new_values, *new_axes).__finalize__( + self, method="swapaxes" + ) def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: """ @@ -965,7 +971,6 @@ def rename( continue ax = self._get_axis(axis_no) - baxis = self._get_block_manager_axis(axis_no) f = com.get_rename_function(replacements) if level is not None: @@ -982,16 +987,15 @@ def rename( ] raise KeyError(f"{missing_labels} not found in axis") - result._data = result._data.rename_axis( - f, axis=baxis, copy=copy, level=level - ) + new_index = ax._transform_index(f, level) + result.set_axis(new_index, axis=axis_no, inplace=True) result._clear_item_cache() if inplace: - self._update_inplace(result._data) + self._update_inplace(result) return None else: - return result.__finalize__(self) + return result.__finalize__(self, method="rename") @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)]) def rename_axis(self, mapper=lib.no_default, **kwargs): @@ -1178,8 +1182,6 @@ def _set_axis_name(self, name, axis=0, inplace=False): inplace : bool, default False If `True`, do operation inplace and return None. - .. versionadded:: 0.21.0 - Returns ------- Series, DataFrame, or None @@ -1210,7 +1212,7 @@ def _set_axis_name(self, name, axis=0, inplace=False): >>> df.index = pd.MultiIndex.from_product( ... [["mammal"], ['dog', 'cat', 'monkey']]) >>> df._set_axis_name(["type", "name"]) - legs + num_legs type name mammal dog 4 cat 4 @@ -1318,13 +1320,13 @@ def equals(self, other): """ if not isinstance(other, self._constructor): return False - return self._data.equals(other._data) + return self._mgr.equals(other._mgr) # ------------------------------------------------------------------------- # Unary Methods def __neg__(self): - values = com.values_from_object(self) + values = self._values if is_bool_dtype(values): arr = operator.inv(values) elif ( @@ -1338,8 +1340,8 @@ def __neg__(self): return self.__array_wrap__(arr) def __pos__(self): - values = com.values_from_object(self) - if is_bool_dtype(values) or is_period_arraylike(values): + values = self._values + if is_bool_dtype(values): arr = values elif ( is_numeric_dtype(values) @@ -1356,8 +1358,8 @@ def __invert__(self): # inv fails with 0 len return self - new_data = self._data.apply(operator.invert) - result = self._constructor(new_data).__finalize__(self) + new_data = self._mgr.apply(operator.invert) + result = self._constructor(new_data).__finalize__(self, method="__invert__") return result def __nonzero__(self): @@ -1723,7 +1725,7 @@ def items(self): for h in self._info_axis: yield h, self[h] - @Appender(items.__doc__) + @doc(items) def iteritems(self): return self.items() @@ -1750,8 +1752,9 @@ def empty(self) -> bool_t: See Also -------- - Series.dropna - DataFrame.dropna + Series.dropna : Return series without null values. 
+ DataFrame.dropna : Return DataFrame with labels on given axis omitted + where (all or any) data are missing. Notes ----- @@ -1792,7 +1795,7 @@ def empty(self) -> bool_t: __array_priority__ = 1000 def __array__(self, dtype=None) -> np.ndarray: - return com.values_from_object(self) + return np.asarray(self._values, dtype=dtype) def __array_wrap__(self, result, context=None): result = lib.item_from_zerodim(result) @@ -1801,7 +1804,9 @@ def __array_wrap__(self, result, context=None): # ptp also requires the item_from_zerodim return result d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) - return self._constructor(result, **d).__finalize__(self) + return self._constructor(result, **d).__finalize__( + self, method="__array_wrap__" + ) # ideally we would define this to avoid the getattr checks, but # is slower @@ -1817,7 +1822,7 @@ def __array_wrap__(self, result, context=None): def __getstate__(self) -> Dict[str, Any]: meta = {k: getattr(self, k, None) for k in self._metadata} return dict( - _data=self._data, + _mgr=self._mgr, _typ=self._typ, _metadata=self._metadata, attrs=self.attrs, @@ -1827,8 +1832,11 @@ def __getstate__(self) -> Dict[str, Any]: def __setstate__(self, state): if isinstance(state, BlockManager): - self._data = state + self._mgr = state elif isinstance(state, dict): + if "_data" in state and "_mgr" not in state: + # compat for older pickles + state["_mgr"] = state.pop("_data") typ = state.get("_typ") if typ is not None: attrs = state.get("_attrs", {}) @@ -1836,7 +1844,7 @@ def __setstate__(self, state): # set in the order of internal names # to avoid definitional recursion - # e.g. say fill_value needing _data to be + # e.g. say fill_value needing _mgr to be # defined meta = set(self._internal_names + self._metadata) for k in list(meta): @@ -1911,117 +1919,7 @@ def _repr_data_resource_(self): %(klass)s in Markdown-friendly format. """ - _shared_docs[ - "to_excel" - ] = """ - Write %(klass)s to an Excel sheet. - - To write a single %(klass)s to an Excel .xlsx file it is only necessary to - specify a target file name. To write to multiple sheets it is necessary to - create an `ExcelWriter` object with a target file name, and specify a sheet - in the file to write to. - - Multiple sheets may be written to by specifying unique `sheet_name`. - With all data written to the file it is necessary to save the changes. - Note that creating an `ExcelWriter` object with a file name that already - exists will result in the contents of the existing file being erased. - - Parameters - ---------- - excel_writer : str or ExcelWriter object - File path or existing ExcelWriter. - sheet_name : str, default 'Sheet1' - Name of sheet which will contain DataFrame. - na_rep : str, default '' - Missing data representation. - float_format : str, optional - Format string for floating point numbers. For example - ``float_format="%%.2f"`` will format 0.1234 to 0.12. - columns : sequence or list of str, optional - Columns to write. - header : bool or list of str, default True - Write out the column names. If a list of string is given it is - assumed to be aliases for the column names. - index : bool, default True - Write row names (index). - index_label : str or sequence, optional - Column label for index column(s) if desired. If not specified, and - `header` and `index` are True, then the index names are used. A - sequence should be given if the DataFrame uses MultiIndex. - startrow : int, default 0 - Upper left cell row to dump data frame. 
- startcol : int, default 0 - Upper left cell column to dump data frame. - engine : str, optional - Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this - via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and - ``io.excel.xlsm.writer``. - merge_cells : bool, default True - Write MultiIndex and Hierarchical Rows as merged cells. - encoding : str, optional - Encoding of the resulting excel file. Only necessary for xlwt, - other writers support unicode natively. - inf_rep : str, default 'inf' - Representation for infinity (there is no native representation for - infinity in Excel). - verbose : bool, default True - Display more information in the error logs. - freeze_panes : tuple of int (length 2), optional - Specifies the one-based bottommost row and rightmost column that - is to be frozen. - - See Also - -------- - to_csv : Write DataFrame to a comma-separated values (csv) file. - ExcelWriter : Class for writing DataFrame objects into excel sheets. - read_excel : Read an Excel file into a pandas DataFrame. - read_csv : Read a comma-separated values (csv) file into DataFrame. - - Notes - ----- - For compatibility with :meth:`~DataFrame.to_csv`, - to_excel serializes lists and dicts to strings before writing. - - Once a workbook has been saved it is not possible write further data - without rewriting the whole workbook. - - Examples - -------- - - Create, write to and save a workbook: - - >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], - ... index=['row 1', 'row 2'], - ... columns=['col 1', 'col 2']) - >>> df1.to_excel("output.xlsx") # doctest: +SKIP - - To specify the sheet name: - - >>> df1.to_excel("output.xlsx", - ... sheet_name='Sheet_name_1') # doctest: +SKIP - - If you wish to write to more than one sheet in the workbook, it is - necessary to specify an ExcelWriter object: - - >>> df2 = df1.copy() - >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP - ... df1.to_excel(writer, sheet_name='Sheet_name_1') - ... df2.to_excel(writer, sheet_name='Sheet_name_2') - - ExcelWriter can also be used to append to an existing Excel file: - - >>> with pd.ExcelWriter('output.xlsx', - ... mode='a') as writer: # doctest: +SKIP - ... df.to_excel(writer, sheet_name='Sheet_name_3') - - To set the library that is used to write the Excel file, - you can pass the `engine` keyword (the default engine is - automatically chosen depending on the file extension): - - >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP - """ - - @Appender(_shared_docs["to_excel"] % dict(klass="object")) + @doc(klass="object") def to_excel( self, excel_writer, @@ -2041,6 +1939,114 @@ def to_excel( verbose=True, freeze_panes=None, ) -> None: + """ + Write {klass} to an Excel sheet. + + To write a single {klass} to an Excel .xlsx file it is only necessary to + specify a target file name. To write to multiple sheets it is necessary to + create an `ExcelWriter` object with a target file name, and specify a sheet + in the file to write to. + + Multiple sheets may be written to by specifying unique `sheet_name`. + With all data written to the file it is necessary to save the changes. + Note that creating an `ExcelWriter` object with a file name that already + exists will result in the contents of the existing file being erased. + + Parameters + ---------- + excel_writer : str or ExcelWriter object + File path or existing ExcelWriter. + sheet_name : str, default 'Sheet1' + Name of sheet which will contain DataFrame. 
+ na_rep : str, default '' + Missing data representation. + float_format : str, optional + Format string for floating point numbers. For example + ``float_format="%.2f"`` will format 0.1234 to 0.12. + columns : sequence or list of str, optional + Columns to write. + header : bool or list of str, default True + Write out the column names. If a list of string is given it is + assumed to be aliases for the column names. + index : bool, default True + Write row names (index). + index_label : str or sequence, optional + Column label for index column(s) if desired. If not specified, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + startrow : int, default 0 + Upper left cell row to dump data frame. + startcol : int, default 0 + Upper left cell column to dump data frame. + engine : str, optional + Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this + via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and + ``io.excel.xlsm.writer``. + merge_cells : bool, default True + Write MultiIndex and Hierarchical Rows as merged cells. + encoding : str, optional + Encoding of the resulting excel file. Only necessary for xlwt, + other writers support unicode natively. + inf_rep : str, default 'inf' + Representation for infinity (there is no native representation for + infinity in Excel). + verbose : bool, default True + Display more information in the error logs. + freeze_panes : tuple of int (length 2), optional + Specifies the one-based bottommost row and rightmost column that + is to be frozen. + + See Also + -------- + to_csv : Write DataFrame to a comma-separated values (csv) file. + ExcelWriter : Class for writing DataFrame objects into excel sheets. + read_excel : Read an Excel file into a pandas DataFrame. + read_csv : Read a comma-separated values (csv) file into DataFrame. + + Notes + ----- + For compatibility with :meth:`~DataFrame.to_csv`, + to_excel serializes lists and dicts to strings before writing. + + Once a workbook has been saved it is not possible write further data + without rewriting the whole workbook. + + Examples + -------- + + Create, write to and save a workbook: + + >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + >>> df1.to_excel("output.xlsx") # doctest: +SKIP + + To specify the sheet name: + + >>> df1.to_excel("output.xlsx", + ... sheet_name='Sheet_name_1') # doctest: +SKIP + + If you wish to write to more than one sheet in the workbook, it is + necessary to specify an ExcelWriter object: + + >>> df2 = df1.copy() + >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP + ... df1.to_excel(writer, sheet_name='Sheet_name_1') + ... df2.to_excel(writer, sheet_name='Sheet_name_2') + + ExcelWriter can also be used to append to an existing Excel file: + + >>> with pd.ExcelWriter('output.xlsx', + ... mode='a') as writer: # doctest: +SKIP + ... df.to_excel(writer, sheet_name='Sheet_name_3') + + To set the library that is used to write the Excel file, + you can pass the `engine` keyword (the default engine is + automatically chosen depending on the file extension): + + >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP + """ + df = self if isinstance(self, ABCDataFrame) else self.to_frame() from pandas.io.formats.excel import ExcelFormatter @@ -2147,7 +2153,6 @@ def to_json( only used when the first argument is a filename. 
By default, the compression is inferred from the filename. - .. versionadded:: 0.21.0 .. versionchanged:: 0.24.0 'infer' option added and set to default index : bool, default True @@ -2170,7 +2175,7 @@ def to_json( See Also -------- - read_json + read_json : Convert a JSON string to pandas object. Notes ----- @@ -2181,45 +2186,141 @@ def to_json( Examples -------- - >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], - ... index=['row 1', 'row 2'], - ... columns=['col 1', 'col 2']) - >>> df.to_json(orient='split') - '{"columns":["col 1","col 2"], - "index":["row 1","row 2"], - "data":[["a","b"],["c","d"]]}' + >>> import json + >>> df = pd.DataFrame( + ... [["a", "b"], ["c", "d"]], + ... index=["row 1", "row 2"], + ... columns=["col 1", "col 2"], + ... ) + + >>> result = df.to_json(orient="split") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + { + "columns": [ + "col 1", + "col 2" + ], + "index": [ + "row 1", + "row 2" + ], + "data": [ + [ + "a", + "b" + ], + [ + "c", + "d" + ] + ] + } Encoding/decoding a Dataframe using ``'records'`` formatted JSON. Note that index labels are not preserved with this encoding. - >>> df.to_json(orient='records') - '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + >>> result = df.to_json(orient="records") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + [ + { + "col 1": "a", + "col 2": "b" + }, + { + "col 1": "c", + "col 2": "d" + } + ] Encoding/decoding a Dataframe using ``'index'`` formatted JSON: - >>> df.to_json(orient='index') - '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + >>> result = df.to_json(orient="index") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + { + "row 1": { + "col 1": "a", + "col 2": "b" + }, + "row 2": { + "col 1": "c", + "col 2": "d" + } + } Encoding/decoding a Dataframe using ``'columns'`` formatted JSON: - >>> df.to_json(orient='columns') - '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}' + >>> result = df.to_json(orient="columns") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + { + "col 1": { + "row 1": "a", + "row 2": "c" + }, + "col 2": { + "row 1": "b", + "row 2": "d" + } + } Encoding/decoding a Dataframe using ``'values'`` formatted JSON: - >>> df.to_json(orient='values') - '[["a","b"],["c","d"]]' - - Encoding with Table Schema + >>> result = df.to_json(orient="values") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + [ + [ + "a", + "b" + ], + [ + "c", + "d" + ] + ] - >>> df.to_json(orient='table') - '{"schema": {"fields": [{"name": "index", "type": "string"}, - {"name": "col 1", "type": "string"}, - {"name": "col 2", "type": "string"}], - "primaryKey": "index", - "pandas_version": "0.20.0"}, - "data": [{"index": "row 1", "col 1": "a", "col 2": "b"}, - {"index": "row 2", "col 1": "c", "col 2": "d"}]}' + Encoding with Table Schema: + + >>> result = df.to_json(orient="table") + >>> parsed = json.loads(result) + >>> json.dumps(parsed, indent=4) # doctest: +SKIP + { + "schema": { + "fields": [ + { + "name": "index", + "type": "string" + }, + { + "name": "col 1", + "type": "string" + }, + { + "name": "col 2", + "type": "string" + } + ], + "primaryKey": [ + "index" + ], + "pandas_version": "0.20.0" + }, + "data": [ + { + "index": "row 1", + "col 1": "a", + "col 2": "b" + }, + { + "index": "row 2", + "col 1": "c", + "col 2": "d" + } + ] + } """ from pandas.io import json @@ -2568,7 
+2669,6 @@ def to_pickle( parameter is equivalent to setting its value to HIGHEST_PROTOCOL. .. [1] https://docs.python.org/3/library/pickle.html. - .. versionadded:: 0.21.0. See Also -------- @@ -2646,7 +2746,8 @@ def to_clipboard( Copy the contents of a DataFrame to the clipboard. >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) - >>> df.to_clipboard(sep=',') + + >>> df.to_clipboard(sep=',') # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # ,A,B,C ... # 0,1,2,3 @@ -2655,7 +2756,7 @@ def to_clipboard( We can omit the index by passing the keyword `index` and setting it to false. - >>> df.to_clipboard(sep=',', index=False) + >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP ... # Wrote the following to the system clipboard: ... # A,B,C ... # 1,2,3 @@ -2995,7 +3096,8 @@ def to_csv( compression mode is 'infer' and `path_or_buf` is path-like, then detect compression mode from the following extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given - and mode is 'zip' or inferred as 'zip', other entries passed as + and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as + one of the above, other entries passed as additional compression options. .. versionchanged:: 1.0.0 @@ -3004,6 +3106,12 @@ def to_csv( and other entries as additional compression options if compression mode is 'zip'. + .. versionchanged:: 1.1.0 + + Passing compression options as keys in dict is + supported for compression modes 'gzip' and 'bz2' + as well as 'zip'. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC @@ -3109,7 +3217,7 @@ def _maybe_cache_changed(self, item, value) -> None: """ The object has called back to us saying maybe it has changed. 
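A minimal sketch of the dict form of ``compression`` described in the ``to_csv`` hunk above, assuming pandas >= 1.1 (the file name and compression level are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

# String form: the compression method is inferred from the '.gz' extension.
df.to_csv("out.csv.gz")

# Dict form: 'method' names the compressor and the remaining keys are
# forwarded to it -- here gzip's compresslevel.
df.to_csv("out.csv.gz", compression={"method": "gzip", "compresslevel": 1})
```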
""" - self._data.set(item, value) + self._mgr.set(item, value) @property def _is_cached(self) -> bool_t: @@ -3151,7 +3259,7 @@ def _maybe_update_cacher( try: ref._maybe_cache_changed(cacher[0], self) except AssertionError: - # ref._data.setitem can raise + # ref._mgr.setitem can raise # AssertionError because of shape mismatch pass @@ -3261,10 +3369,10 @@ class max_speed self._consolidate_inplace() - new_data = self._data.take( + new_data = self._mgr.take( indices, axis=self._get_block_manager_axis(axis), verify=True ) - return self._constructor(new_data).__finalize__(self) + return self._constructor(new_data).__finalize__(self, method="take") def _take_with_is_copy(self: FrameOrSeries, indices, axis=0) -> FrameOrSeries: """ @@ -3422,7 +3530,7 @@ class animal locomotion # so just return them (GH 6394) return self._values[loc] - new_values = self._data.fast_xs(loc) + new_values = self._mgr.fast_xs(loc) result = self._constructor_sliced( new_values, @@ -3450,7 +3558,7 @@ def _get_item_cache(self, item): cache = self._item_cache res = cache.get(item) if res is None: - values = self._data.get(item) + values = self._mgr.get(item) res = self._box_item_values(item, values) cache[item] = res res._set_as_cached(item, self) @@ -3459,15 +3567,6 @@ def _get_item_cache(self, item): res._is_copy = self._is_copy return res - def _iget_item_cache(self, item: int): - """Return the cached item, item represents a positional indexer.""" - ax = self._info_axis - if ax.is_unique: - lower = self._get_item_cache(ax[item]) - else: - return self._ixs(item, axis=1) - return lower - def _box_item_values(self, key, values): raise AbstractMethodError(self) @@ -3479,7 +3578,7 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: """ assert isinstance(slobj, slice), type(slobj) axis = self._get_block_manager_axis(axis) - result = self._constructor(self._data.get_slice(slobj, axis=axis)) + result = self._constructor(self._mgr.get_slice(slobj, axis=axis)) result = result.__finalize__(self) # this could be a view @@ -3488,8 +3587,12 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: result._set_is_copy(self, copy=is_copy) return result + def _iset_item(self, loc: int, value) -> None: + self._mgr.iset(loc, value) + self._clear_item_cache() + def _set_item(self, key, value) -> None: - self._data.set(key, value) + self._mgr.set(key, value) self._clear_item_cache() def _set_is_copy(self, ref, copy: bool_t = True) -> None: @@ -3621,7 +3724,8 @@ def __delitem__(self, key) -> None: # If the above loop ran and didn't delete anything because # there was no match, this call should raise the appropriate # exception: - self._data.delete(key) + loc = self.axes[-1].get_loc(key) + self._mgr.idelete(loc) # delete from the caches try: @@ -3654,7 +3758,7 @@ def get(self, key, default=None): @property def _is_view(self) -> bool_t: """Return boolean indicating if self is view of another array """ - return self._data.is_view + return self._mgr.is_view def reindex_like( self: FrameOrSeries, @@ -3703,8 +3807,6 @@ def reindex_like( the same size as the index and its dtype must exactly match the index's type. - .. versionadded:: 0.21.0 (list-like tolerance) - Returns ------- Series or DataFrame @@ -3866,15 +3968,15 @@ def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: Parameters ---------- + result : same type as self verify_is_copy : bool, default True Provide is_copy checks. 
""" # NOTE: This does *not* call __finalize__ and that's an explicit # decision that we may revisit in the future. - self._reset_cache() self._clear_item_cache() - self._data = getattr(result, "_data", result) + self._mgr = result._mgr self._maybe_update_cacher(verify_is_copy=verify_is_copy) def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: @@ -4144,8 +4246,6 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: the same size as the index and its dtype must exactly match the index's type. - .. versionadded:: 0.21.0 (list-like tolerance) - Returns ------- %(klass)s with changed index. @@ -4342,7 +4442,7 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: # perform the reindex on the axes return self._reindex_axes( axes, level, limit, tolerance, method, fill_value, copy - ).__finalize__(self) + ).__finalize__(self, method="reindex") def _reindex_axes( self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy @@ -4390,7 +4490,7 @@ def _reindex_with_indexers( ) -> FrameOrSeries: """allow_dups indicates an internal call here """ # reindex doing multiple operations on different axes if indicated - new_data = self._data + new_data = self._mgr for axis in sorted(reindexers.keys()): index, indexer = reindexers[axis] baxis = self._get_block_manager_axis(axis) @@ -4411,8 +4511,10 @@ def _reindex_with_indexers( allow_dups=allow_dups, copy=copy, ) + # If we've made a copy once, no need to make another one + copy = False - if copy and new_data is self._data: + if copy and new_data is self._mgr: new_data = new_data.copy() return self._constructor(new_data).__finalize__(self) @@ -4449,7 +4551,8 @@ def filter( See Also -------- - DataFrame.loc + DataFrame.loc : Access a group of rows and columns + by label(s) or a boolean array. Notes ----- @@ -4464,6 +4567,10 @@ def filter( >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), ... index=['mouse', 'rabbit'], ... columns=['one', 'two', 'three']) + >>> df + one two three + mouse 1 2 3 + rabbit 4 5 6 >>> # select columns by name >>> df.filter(items=['one', 'three']) @@ -4696,9 +4803,16 @@ def sample( If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. - random_state : int or numpy.random.RandomState, optional - Seed for the random number generator (if int), or numpy RandomState - object. + random_state : int, array-like, BitGenerator, np.random.RandomState, optional + If int, array-like, or BitGenerator (NumPy>=1.17), seed for + random number generator + If np.random.RandomState, use as numpy RandomState object. + + ..versionchanged:: 1.1.0 + + array-like and BitGenerator (for NumPy>=1.17) object now passed to + np.random.RandomState() as seed + axis : {0 or ‘index’, 1 or ‘columns’, None}, default None Axis to sample. Accepts axis number or name. Default is stat axis for given data type (0 for Series and DataFrames). @@ -4792,10 +4906,10 @@ def sample( if axis == 0: try: weights = self[weights] - except KeyError: + except KeyError as err: raise KeyError( "String passed to weights not a valid column" - ) + ) from err else: raise ValueError( "Strings can only be passed to " @@ -4881,24 +4995,24 @@ def sample( See Also -------- - DataFrame.apply - DataFrame.applymap - Series.map + DataFrame.apply : Apply a function along input axis of DataFrame. + DataFrame.applymap : Apply a function elementwise on a whole DataFrame. 
+ Series.map : Apply a mapping correspondence on a
+ :class:`~pandas.Series`.

 Notes
 -----
- Use ``.pipe`` when chaining together functions that expect
 Series, DataFrames or GroupBy objects. Instead of writing

- >>> f(g(h(df), arg1=a), arg2=b, arg3=c)
+ >>> func(g(h(df), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

 You can write

 >>> (df.pipe(h)
 ... .pipe(g, arg1=a)
- ... .pipe(f, arg2=b, arg3=c)
- ... )
+ ... .pipe(func, arg2=b, arg3=c)
+ ... )  # doctest: +SKIP

 If you have a function that takes the data as (say) the second
 argument, pass a tuple indicating which keyword expects the
@@ -4906,8 +5020,8 @@ def sample(

 >>> (df.pipe(h)
 ... .pipe(g, arg1=a)
- ... .pipe((f, 'arg2'), arg1=a, arg3=c)
- ... )
+ ... .pipe((func, 'arg2'), arg1=a, arg3=c)
+ ... )  # doctest: +SKIP
 """

 @Appender(_shared_docs["pipe"] % _shared_doc_kwargs)
@@ -5029,7 +5143,7 @@ def pipe(self, func, *args, **kwargs):
 # Attribute access

 def __finalize__(
- self: FrameOrSeries, other, method=None, **kwargs
+ self: FrameOrSeries, other, method: Optional[str] = None, **kwargs
 ) -> FrameOrSeries:
 """
 Propagate metadata from other to self.
@@ -5038,9 +5152,14 @@
 ----------
 other : the object from which to get the attributes that we are going
 to propagate
- method : optional, a passed method name ; possibly to take different
- types of propagation actions based on this
+ method : str, optional
+ A passed method name providing context on where ``__finalize__``
+ was called.
+ .. warning::
+
+ The value passed as `method` is not currently considered
+ stable across pandas releases.
 """
 if isinstance(other, NDFrame):
 for name in other.attrs:
@@ -5058,7 +5177,6 @@ def __getattr__(self, name: str):
 """
 # Note: obj.x will always call obj.__getattribute__('x') prior to
 # calling obj.__getattr__('x').
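A concrete, runnable variant of the ``pipe`` pattern shown in the docstring above (the helper functions here are made up for illustration):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

def add_total(frame):
    # Return a new frame with an extra column; the input is not mutated.
    return frame.assign(total=frame["a"] + frame["b"])

def scale(frame, factor):
    return frame * factor

# Reads left to right and is equivalent to scale(add_total(df), factor=10).
result = df.pipe(add_total).pipe(scale, factor=10)
```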
- if ( name in self._internal_names_set or name in self._metadata @@ -5128,12 +5246,12 @@ def _dir_additions(self): def _protect_consolidate(self, f): """ - Consolidate _data -- if the blocks have changed, then clear the + Consolidate _mgr -- if the blocks have changed, then clear the cache """ - blocks_before = len(self._data.blocks) + blocks_before = len(self._mgr.blocks) result = f() - if len(self._data.blocks) != blocks_before: + if len(self._mgr.blocks) != blocks_before: self._clear_item_cache() return result @@ -5141,7 +5259,7 @@ def _consolidate_inplace(self) -> None: """Consolidate data in place and return None""" def f(): - self._data = self._data.consolidate() + self._mgr = self._mgr.consolidate() self._protect_consolidate(f) @@ -5163,18 +5281,18 @@ def _consolidate(self, inplace: bool_t = False): if inplace: self._consolidate_inplace() else: - f = lambda: self._data.consolidate() + f = lambda: self._mgr.consolidate() cons_data = self._protect_consolidate(f) return self._constructor(cons_data).__finalize__(self) @property def _is_mixed_type(self) -> bool_t: - f = lambda: self._data.is_mixed_type + f = lambda: self._mgr.is_mixed_type return self._protect_consolidate(f) @property def _is_numeric_mixed_type(self) -> bool_t: - f = lambda: self._data.is_numeric_mixed_type + f = lambda: self._mgr.is_numeric_mixed_type return self._protect_consolidate(f) def _check_inplace_setting(self, value) -> bool_t: @@ -5194,10 +5312,10 @@ def _check_inplace_setting(self, value) -> bool_t: return True def _get_numeric_data(self): - return self._constructor(self._data.get_numeric_data()).__finalize__(self) + return self._constructor(self._mgr.get_numeric_data()).__finalize__(self) def _get_bool_data(self): - return self._constructor(self._data.get_bool_data()).__finalize__(self) + return self._constructor(self._mgr.get_bool_data()).__finalize__(self) # ---------------------------------------------------------------------- # Internal Interface Methods @@ -5256,7 +5374,7 @@ def values(self) -> np.ndarray: dtype: object >>> df.values array([[ 3, 94, 31], - [ 29, 170, 115]], dtype=int64) + [ 29, 170, 115]]) A DataFrame with mixed type columns(e.g., str/object, int64, float32) results in an ndarray of the broadest type that accommodates these @@ -5277,33 +5395,13 @@ def values(self) -> np.ndarray: ['monkey', nan, None]], dtype=object) """ self._consolidate_inplace() - return self._data.as_array(transpose=self._AXIS_REVERSED) + return self._mgr.as_array(transpose=self._AXIS_REVERSED) @property def _values(self) -> np.ndarray: """internal implementation""" return self.values - def _internal_get_values(self) -> np.ndarray: - """ - Return an ndarray after converting sparse values to dense. - - This is the same as ``.values`` for non-sparse data. For sparse - data contained in a `SparseArray`, the data are first - converted to a dense representation. - - Returns - ------- - numpy.ndarray - Numpy representation of DataFrame. - - See Also - -------- - values : Numpy representation of DataFrame. - SparseArray : Container for sparse data. 
- """ - return self.values - @property def dtypes(self): """ @@ -5334,7 +5432,7 @@ def dtypes(self): """ from pandas import Series - return Series(self._data.get_dtypes(), index=self._info_axis, dtype=np.object_) + return Series(self._mgr.get_dtypes(), index=self._info_axis, dtype=np.object_) def _to_dict_of_blocks(self, copy: bool_t = True): """ @@ -5345,7 +5443,7 @@ def _to_dict_of_blocks(self, copy: bool_t = True): """ return { k: self._constructor(v).__finalize__(self) - for k, v, in self._data.to_dict(copy=copy).items() + for k, v, in self._mgr.to_dict(copy=copy).items() } def astype( @@ -5447,6 +5545,24 @@ def astype( 0 10 1 2 dtype: int64 + + Create a series of dates: + + >>> ser_date = pd.Series(pd.date_range('20200101', periods=3)) + >>> ser_date + 0 2020-01-01 + 1 2020-01-02 + 2 2020-01-03 + dtype: datetime64[ns] + + Datetimes are localized to UTC first before + converting to the specified timezone: + + >>> ser_date.astype('datetime64[ns, US/Eastern]') + 0 2019-12-31 19:00:00-05:00 + 1 2020-01-01 19:00:00-05:00 + 2 2020-01-02 19:00:00-05:00 + dtype: datetime64[ns, US/Eastern] """ if is_dict_like(dtype): if self.ndim == 1: # i.e. Series @@ -5483,8 +5599,8 @@ def astype( else: # else, only a single dtype is given - new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors) - return self._constructor(new_data).__finalize__(self) + new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) + return self._constructor(new_data).__finalize__(self, method="astype") # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) @@ -5596,8 +5712,9 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: 1 [3, 4] dtype: object """ - data = self._data.copy(deep=deep) - return self._constructor(data).__finalize__(self) + data = self._mgr.copy(deep=deep) + self._clear_item_cache() + return self._constructor(data).__finalize__(self, method="copy") def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: return self.copy(deep=deep) @@ -5617,7 +5734,6 @@ def _convert( numeric: bool_t = False, timedelta: bool_t = False, coerce: bool_t = False, - copy: bool_t = True, ) -> FrameOrSeries: """ Attempt to infer better dtype for object columns @@ -5634,10 +5750,6 @@ def _convert( coerce : bool, default False If True, force conversion with unconvertible values converted to nulls (NaN or NaT). - copy : bool, default True - If True, return a copy even if no copy is necessary (e.g. no - conversion was done). Note: This is meant for internal use, and - should not be confused with inplace. Returns ------- @@ -5647,14 +5759,13 @@ def _convert( validate_bool_kwarg(numeric, "numeric") validate_bool_kwarg(timedelta, "timedelta") validate_bool_kwarg(coerce, "coerce") - validate_bool_kwarg(copy, "copy") return self._constructor( - self._data.convert( + self._mgr.convert( datetime=datetime, numeric=numeric, timedelta=timedelta, coerce=coerce, - copy=copy, + copy=True, ) ).__finalize__(self) @@ -5667,8 +5778,6 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: columns unchanged. The inference rules are the same as during normal Series/DataFrame construction. - .. 
versionadded:: 0.21.0 - Returns ------- converted : same type as input object @@ -5702,10 +5811,10 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: # python objects will still be converted to # native numpy numeric types return self._constructor( - self._data.convert( + self._mgr.convert( datetime=True, numeric=False, timedelta=True, coerce=False, copy=True ) - ).__finalize__(self) + ).__finalize__(self, method="infer_objects") def convert_dtypes( self: FrameOrSeries, @@ -5970,11 +6079,11 @@ def fillna( result = self.T.fillna(method=method, limit=limit).T # need to downcast here because of all of the transposes - result._data = result._data.downcast() + result._mgr = result._mgr.downcast() return result - new_data = self._data.interpolate( + new_data = self._mgr.interpolate( method=method, axis=axis, limit=limit, @@ -5983,14 +6092,13 @@ def fillna( downcast=downcast, ) else: - if len(self._get_axis(axis)) == 0: - return self - if self.ndim == 1: if isinstance(value, (dict, ABCSeries)): value = create_series_with_explicit_dtype( value, dtype_if_empty=object ) + value = value.reindex(self.index, copy=False) + value = value._values elif not is_list_like(value): pass else: @@ -6000,7 +6108,7 @@ def fillna( f'"{type(value).__name__}"' ) - new_data = self._data.fillna( + new_data = self._mgr.fillna( value=value, limit=limit, inplace=inplace, downcast=downcast ) @@ -6021,19 +6129,19 @@ def fillna( return result if not inplace else None elif not is_list_like(value): - new_data = self._data.fillna( + new_data = self._mgr.fillna( value=value, limit=limit, inplace=inplace, downcast=downcast ) elif isinstance(value, ABCDataFrame) and self.ndim == 2: - new_data = self.where(self.notna(), value) + new_data = self.where(self.notna(), value)._data else: raise ValueError(f"invalid fill value with a {type(value)}") + result = self._constructor(new_data) if inplace: - self._update_inplace(new_data) - return None + return self._update_inplace(result) else: - return self._constructor(new_data).__finalize__(self) + return result.__finalize__(self, method="fillna") def ffill( self: FrameOrSeries, @@ -6073,12 +6181,20 @@ def bfill( method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast ) - _shared_docs[ - "replace" - ] = """ + @doc(klass=_shared_doc_kwargs["klass"]) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + """ Replace values given in `to_replace` with `value`. - Values of the %(klass)s are replaced with other values dynamically. + Values of the {klass} are replaced with other values dynamically. This differs from updating with ``.loc`` or ``.iloc``, which require you to specify a location to update with some value. @@ -6110,19 +6226,19 @@ def bfill( - Dicts can be used to specify different replacement values for different existing values. For example, - ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and + ``{{'a': 'b', 'y': 'z'}}`` replaces the value 'a' with 'b' and 'y' with 'z'. To use a dict in this way the `value` parameter should be `None`. - For a DataFrame a dict can specify that different values should be replaced in different columns. For example, - ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a' + ``{{'a': 1, 'b': 'z'}}`` looks for the value 1 in column 'a' and the value 'z' in column 'b' and replaces these values with whatever is specified in `value`. The `value` parameter should not be ``None`` in this case. 
You can treat this as a special case of passing two lists except that you are specifying the column to search in. - For a DataFrame nested dictionaries, e.g., - ``{'a': {'b': np.nan}}``, are read as follows: look in column + ``{{'a': {{'b': np.nan}}}}``, are read as follows: look in column 'a' for the value 'b' and replace it with NaN. The `value` parameter should be ``None`` to use a nested dict in this way. You can nest regular expressions as well. Note that @@ -6155,7 +6271,7 @@ def bfill( string. Alternatively, this could be a regular expression or a list, dict, or array of regular expressions in which case `to_replace` must be ``None``. - method : {'pad', 'ffill', 'bfill', `None`} + method : {{'pad', 'ffill', 'bfill', `None`}} The method to use when for replacement, when `to_replace` is a scalar, list or tuple and `value` is ``None``. @@ -6164,7 +6280,7 @@ def bfill( Returns ------- - %(klass)s + {klass} Object after replacement. Raises @@ -6172,7 +6288,9 @@ def bfill( AssertionError * If `regex` is not a ``bool`` and `to_replace` is not ``None``. + TypeError + * If `to_replace` is not a scalar, array-like, ``dict``, or ``None`` * If `to_replace` is a ``dict`` and `value` is not a ``list``, ``dict``, ``ndarray``, or ``Series`` * If `to_replace` is ``None`` and `regex` is not compilable @@ -6181,14 +6299,15 @@ def bfill( * When replacing multiple ``bool`` or ``datetime64`` objects and the arguments to `to_replace` does not match the type of the value being replaced + ValueError * If a ``list`` or an ``ndarray`` is passed to `to_replace` and `value` but they are not the same length. See Also -------- - %(klass)s.fillna : Fill NA values. - %(klass)s.where : Replace values based on boolean condition. + {klass}.fillna : Fill NA values. + {klass}.where : Replace values based on boolean condition. Series.str.replace : Simple string replacement. Notes @@ -6220,9 +6339,9 @@ def bfill( 4 4 dtype: int64 - >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4], + >>> df = pd.DataFrame({{'A': [0, 1, 2, 3, 4], ... 'B': [5, 6, 7, 8, 9], - ... 'C': ['a', 'b', 'c', 'd', 'e']}) + ... 'C': ['a', 'b', 'c', 'd', 'e']}}) >>> df.replace(0, 5) A B C 0 5 5 a @@ -6259,7 +6378,7 @@ def bfill( **dict-like `to_replace`** - >>> df.replace({0: 10, 1: 100}) + >>> df.replace({{0: 10, 1: 100}}) A B C 0 10 5 a 1 100 6 b @@ -6267,7 +6386,7 @@ def bfill( 3 3 8 d 4 4 9 e - >>> df.replace({'A': 0, 'B': 5}, 100) + >>> df.replace({{'A': 0, 'B': 5}}, 100) A B C 0 100 100 a 1 1 6 b @@ -6275,7 +6394,7 @@ def bfill( 3 3 8 d 4 4 9 e - >>> df.replace({'A': {0: 100, 4: 400}}) + >>> df.replace({{'A': {{0: 100, 4: 400}}}}) A B C 0 100 5 a 1 1 6 b @@ -6285,15 +6404,15 @@ def bfill( **Regular expression `to_replace`** - >>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'], - ... 'B': ['abc', 'bar', 'xyz']}) + >>> df = pd.DataFrame({{'A': ['bat', 'foo', 'bait'], + ... 'B': ['abc', 'bar', 'xyz']}}) >>> df.replace(to_replace=r'^ba.$', value='new', regex=True) A B 0 new abc 1 foo new 2 bait xyz - >>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True) + >>> df.replace({{'A': r'^ba.$'}}, {{'A': 'new'}}, regex=True) A B 0 new abc 1 foo bar @@ -6305,7 +6424,7 @@ def bfill( 1 foo new 2 bait xyz - >>> df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}) + >>> df.replace(regex={{r'^ba.$': 'new', 'foo': 'xyz'}}) A B 0 new abc 1 xyz new @@ -6321,9 +6440,9 @@ def bfill( the data types in the `to_replace` parameter must match the data type of the value being replaced: - >>> df = pd.DataFrame({'A': [True, False, True], - ... 
'B': [False, True, False]}) - >>> df.replace({'a string': 'new value', True: False}) # raises + >>> df = pd.DataFrame({{'A': [True, False, True], + ... 'B': [False, True, False]}}) + >>> df.replace({{'a string': 'new value', True: False}}) # raises Traceback (most recent call last): ... TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' @@ -6331,7 +6450,7 @@ def bfill( This raises a ``TypeError`` because one of the ``dict`` keys is not of the correct type for replacement. - Compare the behavior of ``s.replace({'a': None})`` and + Compare the behavior of ``s.replace({{'a': None}})`` and ``s.replace('a', None)`` to understand the peculiarities of the `to_replace` parameter: @@ -6339,10 +6458,10 @@ def bfill( When one uses a dict as the `to_replace` value, it is like the value(s) in the dict are equal to the `value` parameter. - ``s.replace({'a': None})`` is equivalent to - ``s.replace(to_replace={'a': None}, value=None, method=None)``: + ``s.replace({{'a': None}})`` is equivalent to + ``s.replace(to_replace={{'a': None}}, value=None, method=None)``: - >>> s.replace({'a': None}) + >>> s.replace({{'a': None}}) 0 10 1 None 2 None @@ -6365,17 +6484,17 @@ def bfill( 4 b dtype: object """ + if not ( + is_scalar(to_replace) + or is_re_compilable(to_replace) + or is_list_like(to_replace) + ): + raise TypeError( + "Expecting 'to_replace' to be either a scalar, array-like, " + "dict or None, got invalid type " + f"{repr(type(to_replace).__name__)}" + ) - @Appender(_shared_docs["replace"] % _shared_doc_kwargs) - def replace( - self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method="pad", - ): inplace = validate_bool_kwarg(inplace, "inplace") if not is_bool(regex) and to_replace is not None: raise AssertionError("'to_replace' must be 'None' if 'regex' is not a bool") @@ -6439,36 +6558,29 @@ def replace( if not self.size: return self - new_data = self._data if is_dict_like(to_replace): if is_dict_like(value): # {'A' : NA} -> {'A' : 0} - res = self if inplace else self.copy() - for c, src in to_replace.items(): - if c in value and c in self: - # object conversion is handled in - # series.replace which is called recursively - res[c] = res[c].replace( - to_replace=src, - value=value[c], - inplace=False, - regex=regex, - ) - return None if inplace else res + # Note: Checking below for `in foo.keys()` instead of + # `in foo`is needed for when we have a Series and not dict + mapping = { + col: (to_replace[col], value[col]) + for col in to_replace.keys() + if col in value.keys() and col in self + } + return self._replace_columnwise(mapping, inplace, regex) # {'A': NA} -> 0 elif not is_list_like(value): - keys = [(k, src) for k, src in to_replace.items() if k in self] - keys_len = len(keys) - 1 - for i, (k, src) in enumerate(keys): - convert = i == keys_len - new_data = new_data.replace( - to_replace=src, - value=value, - filter=[k], - inplace=inplace, - regex=regex, - convert=convert, + # Operate column-wise + if self.ndim == 1: + raise ValueError( + "Series.replace cannot use dict-like to_replace " + "and non-None value" ) + mapping = { + col: (to_rep, value) for col, to_rep in to_replace.items() + } + return self._replace_columnwise(mapping, inplace, regex) else: raise TypeError("value argument must be scalar, dict, or Series") @@ -6480,7 +6592,7 @@ def replace( f"Expecting {len(to_replace)} got {len(value)} " ) - new_data = self._data.replace_list( + new_data = self._mgr.replace_list( src_list=to_replace, dest_list=value, inplace=inplace, @@ -6488,7 +6600,7 
@@ def replace( ) else: # [NA, ''] -> 0 - new_data = self._data.replace( + new_data = self._mgr.replace( to_replace=to_replace, value=value, inplace=inplace, regex=regex ) elif to_replace is None: @@ -6509,20 +6621,17 @@ def replace( # dest iterable dict-like if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1} - new_data = self._data - - for k, v in value.items(): - if k in self: - new_data = new_data.replace( - to_replace=to_replace, - value=v, - filter=[k], - inplace=inplace, - regex=regex, - ) + # Operate column-wise + if self.ndim == 1: + raise ValueError( + "Series.replace cannot use dict-value and " + "non-None to_replace" + ) + mapping = {col: (to_replace, val) for col, val in value.items()} + return self._replace_columnwise(mapping, inplace, regex) elif not is_list_like(value): # NA -> 0 - new_data = self._data.replace( + new_data = self._mgr.replace( to_replace=to_replace, value=value, inplace=inplace, regex=regex ) else: @@ -6530,10 +6639,11 @@ def replace( f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}' ) + result = self._constructor(new_data) if inplace: - self._update_inplace(new_data) + return self._update_inplace(result) else: - return self._constructor(new_data).__finalize__(self) + return result.__finalize__(self, method="replace") _shared_docs[ "interpolate" @@ -6738,27 +6848,16 @@ def interpolate( axis = self._get_axis_number(axis) if axis == 0: - ax = self._info_axis_name - _maybe_transposed_self = self - elif axis == 1: - _maybe_transposed_self = self.T - ax = 1 - - ax = _maybe_transposed_self._get_axis_number(ax) - - if _maybe_transposed_self.ndim == 2: - alt_ax = 1 - ax + df = self else: - alt_ax = ax + df = self.T - if isinstance(_maybe_transposed_self.index, MultiIndex) and method != "linear": + if isinstance(df.index, MultiIndex) and method != "linear": raise ValueError( "Only `method=linear` interpolation is supported on MultiIndexes." ) - if _maybe_transposed_self._data.get_dtype_counts().get("object") == len( - _maybe_transposed_self.T - ): + if df.ndim == 2 and np.all(df.dtypes == np.dtype(object)): raise TypeError( "Cannot interpolate with all object-dtype columns " "in the DataFrame. Try setting at least one " @@ -6768,9 +6867,9 @@ def interpolate( # create/use the index if method == "linear": # prior default - index = np.arange(len(_maybe_transposed_self._get_axis(alt_ax))) + index = np.arange(len(df.index)) else: - index = _maybe_transposed_self._get_axis(alt_ax) + index = df.index methods = {"index", "values", "nearest", "time"} is_numeric_or_datetime = ( is_numeric_dtype(index) @@ -6791,12 +6890,11 @@ def interpolate( "has not been implemented. Try filling " "those NaNs before interpolating." 
) - data = _maybe_transposed_self._data + data = df._mgr new_data = data.interpolate( method=method, - axis=ax, + axis=self._info_axis_number, index=index, - values=_maybe_transposed_self, limit=limit, limit_direction=limit_direction, limit_area=limit_area, @@ -6805,15 +6903,13 @@ def interpolate( **kwargs, ) + result = self._constructor(new_data) + if axis == 1: + result = result.T if inplace: - if axis == 1: - new_data = self._constructor(new_data).T._data - self._update_inplace(new_data) + return self._update_inplace(result) else: - res = self._constructor(new_data).__finalize__(self) - if axis == 1: - res = res.T - return res + return result.__finalize__(self, method="interpolate") # ---------------------------------------------------------------------- # Timeseries methods Methods @@ -6975,7 +7071,7 @@ def asof(self, where, subset=None): return Series(np.nan, index=self.columns, name=where[0]) - locs = self.index.asof_locs(where, ~(nulls.values)) + locs = self.index.asof_locs(where, ~(nulls._values)) # mask the missing missing = locs == -1 @@ -7051,11 +7147,11 @@ def asof(self, where, subset=None): @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isna(self: FrameOrSeries) -> FrameOrSeries: - return isna(self).__finalize__(self) + return isna(self).__finalize__(self, method="isna") @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isnull(self: FrameOrSeries) -> FrameOrSeries: - return isna(self).__finalize__(self) + return isna(self).__finalize__(self, method="isnull") _shared_docs[ "notna" @@ -7121,11 +7217,11 @@ def isnull(self: FrameOrSeries) -> FrameOrSeries: @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notna(self: FrameOrSeries) -> FrameOrSeries: - return notna(self).__finalize__(self) + return notna(self).__finalize__(self, method="notna") @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notnull(self: FrameOrSeries) -> FrameOrSeries: - return notna(self).__finalize__(self) + return notna(self).__finalize__(self, method="notnull") def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): if (lower is not None and np.any(isna(lower))) or ( @@ -7134,7 +7230,7 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): raise ValueError("Cannot use an NA value as a clip threshold") result = self - mask = isna(self.values) + mask = isna(self._values) with np.errstate(all="ignore"): if upper is not None: @@ -7148,7 +7244,7 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): result[mask] = np.nan if inplace: - self._update_inplace(result) + return self._update_inplace(result) else: return result @@ -7203,8 +7299,6 @@ def clip( Align object with lower and upper along the given axis. inplace : bool, default False Whether to perform the operation in place on the data. - - .. versionadded:: 0.21.0 *args, **kwargs Additional keywords have no effect but might be accepted for compatibility with numpy. @@ -7215,6 +7309,12 @@ def clip( Same type as calling object with the values outside the clip boundaries replaced. + See Also + -------- + Series.clip : Trim values at input threshold in series. + DataFrame.clip : Trim values at input threshold in dataframe. + numpy.clip : Clip (limit) the values in an array. + Examples -------- >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]} @@ -7379,6 +7479,7 @@ def asfreq( Parameters ---------- freq : DateOffset or str + Frequency DateOffset or string. 
method : {'backfill'/'bfill', 'pad'/'ffill'}, default None Method to use for filling holes in reindexed Series (note this does not fill NaNs that already were present): @@ -7396,11 +7497,12 @@ def asfreq( Returns ------- - converted : same type as caller + Same type as caller + Object converted to the specified frequency. See Also -------- - reindex + reindex : Conform DataFrame to new index with optional filling logic. Notes ----- @@ -7521,8 +7623,8 @@ def at_time( index = self._get_axis(axis) try: indexer = index.indexer_at_time(time, asof=asof) - except AttributeError: - raise TypeError("Index must be DatetimeIndex") + except AttributeError as err: + raise TypeError("Index must be DatetimeIndex") from err return self._take_with_is_copy(indexer, axis=axis) @@ -7609,8 +7711,8 @@ def between_time( include_start=include_start, include_end=include_end, ) - except AttributeError: - raise TypeError("Index must be DatetimeIndex") + except AttributeError as err: + raise TypeError("Index must be DatetimeIndex") from err return self._take_with_is_copy(indexer, axis=axis) @@ -7908,15 +8010,21 @@ def resample( def first(self: FrameOrSeries, offset) -> FrameOrSeries: """ - Method to subset initial periods of time series data based on a date offset. + Select initial periods of time series data based on a date offset. + + When having a DataFrame with dates as index, this function can + select the first few rows based on a date offset. Parameters ---------- - offset : str, DateOffset, dateutil.relativedelta + offset : str, DateOffset or dateutil.relativedelta + The offset length of the data that will be selected. For instance, + '1M' will display all the rows having their index within the first month. Returns ------- - subset : same type as caller + Series or DataFrame + A subset of the caller. Raises ------ @@ -7932,7 +8040,7 @@ def first(self: FrameOrSeries, offset) -> FrameOrSeries: Examples -------- >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') - >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i) + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 1 @@ -7947,7 +8055,7 @@ def first(self: FrameOrSeries, offset) -> FrameOrSeries: 2018-04-09 1 2018-04-11 2 - Notice the data for 3 first calender days were returned, not the first + Notice the data for 3 first calendar days were returned, not the first 3 days observed in the dataset, and therefore data for 2018-04-13 was not returned. """ @@ -7970,15 +8078,21 @@ def first(self: FrameOrSeries, offset) -> FrameOrSeries: def last(self: FrameOrSeries, offset) -> FrameOrSeries: """ - Method to subset final periods of time series data based on a date offset. + Select final periods of time series data based on a date offset. + + When having a DataFrame with dates as index, this function can + select the last few rows based on a date offset. Parameters ---------- offset : str, DateOffset, dateutil.relativedelta + The offset length of the data that will be selected. For instance, + '3D' will display all the rows having their index within the last 3 days. Returns ------- - subset : same type as caller + Series or DataFrame + A subset of the caller. Raises ------ @@ -8009,7 +8123,7 @@ def last(self: FrameOrSeries, offset) -> FrameOrSeries: 2018-04-13 3 2018-04-15 4 - Notice the data for 3 last calender days were returned, not the last + Notice the data for 3 last calendar days were returned, not the last 3 observed days in the dataset, and therefore data for 2018-04-11 was not returned. 
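A small sketch of the offset-based selection both docstrings describe, using the '1M' case mentioned for ``first`` (the index values are illustrative):

```python
import pandas as pd

i = pd.date_range("2018-01-30", periods=5, freq="15D")
ts = pd.DataFrame({"A": range(5)}, index=i)

# Rows whose index falls within the first month of the data.
ts.first("1M")

# Rows whose index falls within the last three days of the data.
ts.last("3D")
```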
""" @@ -8131,7 +8245,7 @@ def ranker(data): pct=pct, ) ranks = self._constructor(ranks, **data._construct_axes_dict()) - return ranks.__finalize__(self) + return ranks.__finalize__(self, method="rank") # if numeric_only is None, and we can't get anything, we try with # numeric_only=True @@ -8148,9 +8262,21 @@ def ranker(data): return ranker(data) - _shared_docs[ - "align" - ] = """ + @doc(**_shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): + """ Align two objects on their axes with the specified join method. Join method is specified for each axis Index. @@ -8158,7 +8284,7 @@ def ranker(data): Parameters ---------- other : DataFrame or Series - join : {'outer', 'inner', 'left', 'right'}, default 'outer' + join : {{'outer', 'inner', 'left', 'right'}}, default 'outer' axis : allowed axis of the other object, default None Align on index (0), columns (1), or both (None). level : int or level name, default None @@ -8170,7 +8296,7 @@ def ranker(data): fill_value : scalar, default np.NaN Value to use for missing values. Defaults to NaN, but can be any "compatible" value. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None Method to use for filling holes in reindexed Series: - pad / ffill: propagate last valid observation forward to next valid. @@ -8183,32 +8309,18 @@ def ranker(data): be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. - fill_axis : %(axes_single_arg)s, default 0 + fill_axis : {axes_single_arg}, default 0 Filling axis, method and limit. - broadcast_axis : %(axes_single_arg)s, default None + broadcast_axis : {axes_single_arg}, default None Broadcast values along this axis, if aligning two objects of different dimensions. Returns ------- - (left, right) : (%(klass)s, type of other) + (left, right) : ({klass}, type of other) Aligned objects. 
""" - @Appender(_shared_docs["align"] % _shared_doc_kwargs) - def align( - self, - other, - join="outer", - axis=None, - level=None, - copy=True, - fill_value=None, - method=None, - limit=None, - fill_axis=0, - broadcast_axis=None, - ): method = missing.clean_fill_method(method) if broadcast_axis == 1 and self.ndim != other.ndim: @@ -8326,9 +8438,9 @@ def _align_frame( ) if method is not None: - left = self._ensure_type( - left.fillna(method=method, axis=fill_axis, limit=limit) - ) + _left = left.fillna(method=method, axis=fill_axis, limit=limit) + assert _left is not None # needed for mypy + left = _left right = right.fillna(method=method, axis=fill_axis, limit=limit) # if DatetimeIndex have different tz, convert to UTC @@ -8338,7 +8450,10 @@ def _align_frame( left.index = join_index right.index = join_index - return left.__finalize__(self), right.__finalize__(other) + return ( + left.__finalize__(self), + right.__finalize__(other), + ) def _align_series( self, @@ -8373,7 +8488,7 @@ def _align_series( else: # one has > 1 ndim - fdata = self._data + fdata = self._mgr if axis == 0: join_index = self.index lidx, ridx = None, None @@ -8398,7 +8513,7 @@ def _align_series( else: raise ValueError("Must specify axis=0 or 1") - if copy and fdata is self._data: + if copy and fdata is self._mgr: fdata = fdata.copy() left = self._constructor(fdata) @@ -8422,7 +8537,10 @@ def _align_series( left.index = join_index right.index = join_index - return left.__finalize__(self), right.__finalize__(other) + return ( + left.__finalize__(self), + right.__finalize__(other), + ) def _where( self, @@ -8465,12 +8583,15 @@ def _where( for dt in cond.dtypes: if not is_bool_dtype(dt): raise ValueError(msg.format(dtype=dt)) + else: + # GH#21947 we have an empty DataFrame, could be object-dtype + cond = cond.astype(bool) cond = -cond if inplace else cond # try to align with other try_quick = True - if hasattr(other, "align"): + if isinstance(other, NDFrame): # align with me if other.ndim <= self.ndim: @@ -8497,12 +8618,12 @@ def _where( if self.ndim == 1: - icond = cond.values + icond = cond._values # GH 2745 / GH 4192 # treat like a scalar if len(other) == 1: - other = np.array(other[0]) + other = other[0] # GH 3235 # match True cond to other @@ -8510,7 +8631,7 @@ def _where( # try to not change dtype at first (if try_quick) if try_quick: - new_other = com.values_from_object(self) + new_other = np.asarray(self) new_other = new_other.copy() new_other[icond] = other other = new_other @@ -8544,18 +8665,14 @@ def _where( # reconstruct the block manager self._check_inplace_setting(other) - new_data = self._data.putmask( - mask=cond, - new=other, - align=align, - inplace=True, - axis=block_axis, - transpose=self._AXIS_REVERSED, + new_data = self._mgr.putmask( + mask=cond, new=other, align=align, axis=block_axis, ) - self._update_inplace(new_data) + result = self._constructor(new_data) + return self._update_inplace(result) else: - new_data = self._data.where( + new_data = self._mgr.where( other=other, cond=cond, align=align, @@ -8563,8 +8680,8 @@ def _where( try_cast=try_cast, axis=block_axis, ) - - return self._constructor(new_data).__finalize__(self) + result = self._constructor(new_data) + return result.__finalize__(self) _shared_docs[ "where" @@ -8747,9 +8864,11 @@ def mask( errors=errors, ) - _shared_docs[ - "shift" - ] = """ + @doc(klass=_shared_doc_kwargs["klass"]) + def shift( + self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None + ) -> FrameOrSeries: + """ Shift index by desired number of periods with 
an optional time `freq`. When `freq` is not passed, shift the index without realigning the data. @@ -8766,7 +8885,7 @@ def mask( If `freq` is specified then the index values are shifted but the data is not realigned. That is, use `freq` if you would like to extend the index when shifting and preserve the original data. - axis : {0 or 'index', 1 or 'columns', None}, default None + axis : {{0 or 'index', 1 or 'columns', None}}, default None Shift direction. fill_value : object, optional The scalar value to use for newly introduced missing values. @@ -8779,7 +8898,7 @@ def mask( Returns ------- - %(klass)s + {klass} Copy of input object, shifted. See Also @@ -8792,9 +8911,9 @@ def mask( Examples -------- - >>> df = pd.DataFrame({'Col1': [10, 20, 15, 30, 45], + >>> df = pd.DataFrame({{'Col1': [10, 20, 15, 30, 45], ... 'Col2': [13, 23, 18, 33, 48], - ... 'Col3': [17, 27, 22, 37, 52]}) + ... 'Col3': [17, 27, 22, 37, 52]}}) >>> df.shift(periods=3) Col1 Col2 Col3 @@ -8819,24 +8938,19 @@ def mask( 2 0 0 0 3 10 13 17 4 20 23 27 - """ - - @Appender(_shared_docs["shift"] % _shared_doc_kwargs) - def shift( - self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None - ) -> FrameOrSeries: + """ if periods == 0: return self.copy() block_axis = self._get_block_manager_axis(axis) if freq is None: - new_data = self._data.shift( + new_data = self._mgr.shift( periods=periods, axis=block_axis, fill_value=fill_value ) else: return self.tshift(periods, freq) - return self._constructor(new_data).__finalize__(self) + return self._constructor(new_data).__finalize__(self, method="shift") def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: """ @@ -8873,10 +8987,10 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: shifted_axis = self._get_axis(axis)[islicer] new_obj.set_axis(shifted_axis, axis=axis, inplace=True) - return new_obj.__finalize__(self) + return new_obj.__finalize__(self, method="slice_shift") def tshift( - self: FrameOrSeries, periods: int = 1, freq=None, axis=0 + self: FrameOrSeries, periods: int = 1, freq=None, axis: Axis = 0 ) -> FrameOrSeries: """ Shift the time index, using the index's frequency if available. 
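An illustrative sketch of the difference between a plain shift and a ``freq``-based shift, which is what ``tshift`` performs using the index's own frequency (data and dates are arbitrary):

```python
import pandas as pd

ts = pd.Series(
    [1, 2, 3, 4], index=pd.date_range("2020-01-01", periods=4, freq="D")
)

# Without `freq` the values slide along the fixed index, introducing NaN.
ts.shift(1)

# With `freq` the index itself is moved forward one day; no data is lost.
ts.shift(1, freq="D")

# tshift shifts the index only, reusing the frequency attached to it.
ts.tshift(1)
```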
@@ -8918,22 +9032,22 @@ def tshift( if isinstance(freq, str): freq = to_offset(freq) - block_axis = self._get_block_manager_axis(axis) + axis = self._get_axis_number(axis) if isinstance(index, PeriodIndex): orig_freq = to_offset(index.freq) - if freq == orig_freq: - new_data = self._data.copy() - new_data.axes[block_axis] = index.shift(periods) - elif orig_freq is not None: + if freq != orig_freq: + assert orig_freq is not None # for mypy raise ValueError( f"Given freq {freq.rule_code} does not match " f"PeriodIndex freq {orig_freq.rule_code}" ) + new_ax = index.shift(periods) else: - new_data = self._data.copy() - new_data.axes[block_axis] = index.shift(periods, freq) + new_ax = index.shift(periods, freq) - return self._constructor(new_data).__finalize__(self) + result = self.copy() + result.set_axis(new_ax, axis, inplace=True) + return result.__finalize__(self, method="tshift") def truncate( self: FrameOrSeries, before=None, after=None, axis=None, copy: bool_t = True @@ -9142,9 +9256,9 @@ def _tz_convert(ax, tz): raise ValueError(f"The level {level} is not valid") ax = _tz_convert(ax, tz) - result = self._constructor(self._data, copy=copy) + result = self.copy(deep=copy) result = result.set_axis(ax, axis=axis, inplace=False) - return result.__finalize__(self) + return result.__finalize__(self, method="tz_convert") def tz_localize( self: FrameOrSeries, @@ -9311,9 +9425,9 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): raise ValueError(f"The level {level} is not valid") ax = _tz_localize(ax, tz, ambiguous, nonexistent) - result = self._constructor(self._data, copy=copy) + result = self.copy(deep=copy) result = result.set_axis(ax, axis=axis, inplace=False) - return result.__finalize__(self) + return result.__finalize__(self, method="tz_localize") # ---------------------------------------------------------------------- # Numeric Methods @@ -9511,12 +9625,13 @@ def describe( ... np.datetime64("2010-01-01") ... ]) >>> s.describe() - count 3 - unique 2 - top 2010-01-01 00:00:00 - freq 2 - first 2000-01-01 00:00:00 - last 2010-01-01 00:00:00 + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 dtype: object Describing a ``DataFrame``. By default only numeric fields @@ -9539,11 +9654,11 @@ def describe( Describing all columns of a ``DataFrame`` regardless of data type. - >>> df.describe(include='all') - categorical numeric object + >>> df.describe(include='all') # doctest: +SKIP + categorical numeric object count 3 3.0 3 unique 3 NaN 3 - top f NaN c + top f NaN a freq 1 NaN 1 mean NaN 2.0 NaN std NaN 1.0 NaN @@ -9582,11 +9697,11 @@ def describe( Including only string columns in a ``DataFrame`` description. - >>> df.describe(include=[np.object]) + >>> df.describe(include=[np.object]) # doctest: +SKIP object count 3 unique 3 - top c + top a freq 1 Including only categorical columns from a ``DataFrame`` description. @@ -9600,16 +9715,16 @@ def describe( Excluding numeric columns from a ``DataFrame`` description. - >>> df.describe(exclude=[np.number]) + >>> df.describe(exclude=[np.number]) # doctest: +SKIP categorical object count 3 3 unique 3 3 - top f c + top f a freq 1 1 Excluding object columns from a ``DataFrame`` description. 
- >>> df.describe(exclude=[np.object]) + >>> df.describe(exclude=[np.object]) # doctest: +SKIP categorical numeric count 3 3.0 unique 3 NaN @@ -9718,7 +9833,7 @@ def describe_1d(data): ldesc = [describe_1d(s) for _, s in data.items()] # set a convenient order for rows - names: List[Optional[Hashable]] = [] + names: List[Label] = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) for idxnames in ldesc_indexes: for name in idxnames: @@ -9860,9 +9975,9 @@ def pct_change( if fill_method is None: data = self else: - data = self._ensure_type( - self.fillna(method=fill_method, axis=axis, limit=limit) - ) + _data = self.fillna(method=fill_method, axis=axis, limit=limit) + assert _data is not None # needed for mypy + data = _data rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 if freq is not None: @@ -9888,37 +10003,37 @@ def _add_numeric_operations(cls): """ Add the operations to the cls; evaluate the doc strings again """ - axis_descr, name, name2 = _doc_parms(cls) + axis_descr, name1, name2 = _doc_parms(cls) cls.any = _make_logical_function( cls, "any", - name, - name2, - axis_descr, - _any_desc, - nanops.nanany, - _any_see_also, - _any_examples, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc=_any_desc, + func=nanops.nanany, + see_also=_any_see_also, + examples=_any_examples, empty_value=False, ) cls.all = _make_logical_function( cls, "all", - name, - name2, - axis_descr, - _all_desc, - nanops.nanall, - _all_see_also, - _all_examples, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc=_all_desc, + func=nanops.nanall, + see_also=_all_see_also, + examples=_all_examples, empty_value=True, ) @Substitution( desc="Return the mean absolute deviation of the values " "for the requested axis.", - name1=name, + name1=name1, name2=name2, axis_descr=axis_descr, min_count="", @@ -9946,177 +10061,169 @@ def mad(self, axis=None, skipna=None, level=None): cls.sem = _make_stat_function_ddof( cls, "sem", - name, - name2, - axis_descr, - "Return unbiased standard error of the mean over requested " + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return unbiased standard error of the mean over requested " "axis.\n\nNormalized by N-1 by default. This can be changed " "using the ddof argument", - nanops.nansem, + func=nanops.nansem, ) cls.var = _make_stat_function_ddof( cls, "var", - name, - name2, - axis_descr, - "Return unbiased variance over requested axis.\n\nNormalized by " + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return unbiased variance over requested axis.\n\nNormalized by " "N-1 by default. This can be changed using the ddof argument", - nanops.nanvar, + func=nanops.nanvar, ) cls.std = _make_stat_function_ddof( cls, "std", - name, - name2, - axis_descr, - "Return sample standard deviation over requested axis." + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return sample standard deviation over requested axis." "\n\nNormalized by N-1 by default. 
This can be changed using the " "ddof argument", - nanops.nanstd, + func=nanops.nanstd, ) cls.cummin = _make_cum_function( cls, "cummin", - name, - name2, - axis_descr, - "minimum", - np.minimum.accumulate, - "min", - np.inf, - np.nan, - _cummin_examples, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="minimum", + accum_func=np.minimum.accumulate, + accum_func_name="min", + examples=_cummin_examples, ) cls.cumsum = _make_cum_function( cls, "cumsum", - name, - name2, - axis_descr, - "sum", - np.cumsum, - "sum", - 0.0, - np.nan, - _cumsum_examples, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="sum", + accum_func=np.cumsum, + accum_func_name="sum", + examples=_cumsum_examples, ) cls.cumprod = _make_cum_function( cls, "cumprod", - name, - name2, - axis_descr, - "product", - np.cumprod, - "prod", - 1.0, - np.nan, - _cumprod_examples, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="product", + accum_func=np.cumprod, + accum_func_name="prod", + examples=_cumprod_examples, ) cls.cummax = _make_cum_function( cls, "cummax", - name, - name2, - axis_descr, - "maximum", - np.maximum.accumulate, - "max", - -np.inf, - np.nan, - _cummax_examples, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="maximum", + accum_func=np.maximum.accumulate, + accum_func_name="max", + examples=_cummax_examples, ) cls.sum = _make_min_count_stat_function( cls, "sum", - name, - name2, - axis_descr, - """Return the sum of the values for the requested axis.\n - This is equivalent to the method ``numpy.sum``.""", - nanops.nansum, - _stat_func_see_also, - _sum_examples, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return the sum of the values for the requested axis.\n\n" + "This is equivalent to the method ``numpy.sum``.", + func=nanops.nansum, + see_also=_stat_func_see_also, + examples=_sum_examples, ) cls.mean = _make_stat_function( cls, "mean", - name, - name2, - axis_descr, - "Return the mean of the values for the requested axis.", - nanops.nanmean, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return the mean of the values for the requested axis.", + func=nanops.nanmean, ) cls.skew = _make_stat_function( cls, "skew", - name, - name2, - axis_descr, - "Return unbiased skew over requested axis.\n\nNormalized by N-1.", - nanops.nanskew, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.", + func=nanops.nanskew, ) cls.kurt = _make_stat_function( cls, "kurt", - name, - name2, - axis_descr, - "Return unbiased kurtosis over requested axis.\n\n" + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return unbiased kurtosis over requested axis.\n\n" "Kurtosis obtained using Fisher's definition of\n" "kurtosis (kurtosis of normal == 0.0). 
Normalized " "by N-1.", - nanops.nankurt, + func=nanops.nankurt, ) cls.kurtosis = cls.kurt cls.prod = _make_min_count_stat_function( cls, "prod", - name, - name2, - axis_descr, - "Return the product of the values for the requested axis.", - nanops.nanprod, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return the product of the values for the requested axis.", + func=nanops.nanprod, examples=_prod_examples, ) cls.product = cls.prod cls.median = _make_stat_function( cls, "median", - name, - name2, - axis_descr, - "Return the median of the values for the requested axis.", - nanops.nanmedian, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return the median of the values for the requested axis.", + func=nanops.nanmedian, ) cls.max = _make_stat_function( cls, "max", - name, - name2, - axis_descr, - """Return the maximum of the values for the requested axis.\n - If you want the *index* of the maximum, use ``idxmax``. This is - the equivalent of the ``numpy.ndarray`` method ``argmax``.""", - nanops.nanmax, - _stat_func_see_also, - _max_examples, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return the maximum of the values for the requested axis.\n\n" + "If you want the *index* of the maximum, use ``idxmax``. This is" + "the equivalent of the ``numpy.ndarray`` method ``argmax``.", + func=nanops.nanmax, + see_also=_stat_func_see_also, + examples=_max_examples, ) cls.min = _make_stat_function( cls, "min", - name, - name2, - axis_descr, - """Return the minimum of the values for the requested axis.\n - If you want the *index* of the minimum, use ``idxmin``. This is - the equivalent of the ``numpy.ndarray`` method ``argmin``.""", - nanops.nanmin, - _stat_func_see_also, - _min_examples, + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return the minimum of the values for the requested axis.\n\n" + "If you want the *index* of the minimum, use ``idxmin``. This is" + "the equivalent of the ``numpy.ndarray`` method ``argmin``.", + func=nanops.nanmin, + see_also=_stat_func_see_also, + examples=_min_examples, ) @classmethod @@ -10127,7 +10234,7 @@ def _add_series_or_dataframe_operations(cls): """ from pandas.core.window import EWM, Expanding, Rolling, Window - @Appender(Rolling.__doc__) + @doc(Rolling) def rolling( self, window, @@ -10165,14 +10272,14 @@ def rolling( cls.rolling = rolling - @Appender(Expanding.__doc__) + @doc(Expanding) def expanding(self, min_periods=1, center=False, axis=0): axis = self._get_axis_number(axis) return Expanding(self, min_periods=min_periods, center=center, axis=axis) cls.expanding = expanding - @Appender(EWM.__doc__) + @doc(EWM) def ewm( self, com=None, @@ -10446,13 +10553,14 @@ def _doc_parms(cls): skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. -*args, **kwargs : +*args, **kwargs Additional keywords have no effect but might be accepted for compatibility with NumPy. Returns ------- %(name1)s or %(name2)s + Return cumulative %(desc)s of %(name1)s or %(name2)s. 
See Also -------- @@ -10936,8 +11044,16 @@ def _doc_parms(cls): def _make_min_count_stat_function( - cls, name, name1, name2, axis_descr, desc, f, see_also: str = "", examples: str = "" -): + cls, + name: str, + name1: str, + name2: str, + axis_descr: str, + desc: str, + func: Callable, + see_also: str = "", + examples: str = "", +) -> Callable: @Substitution( desc=desc, name1=name1, @@ -10972,8 +11088,8 @@ def stat_func( name, axis=axis, level=level, skipna=skipna, min_count=min_count ) return self._reduce( - f, - name, + func, + name=name, axis=axis, skipna=skipna, numeric_only=numeric_only, @@ -10984,8 +11100,16 @@ def stat_func( def _make_stat_function( - cls, name, name1, name2, axis_descr, desc, f, see_also: str = "", examples: str = "" -): + cls, + name: str, + name1: str, + name2: str, + axis_descr: str, + desc: str, + func: Callable, + see_also: str = "", + examples: str = "", +) -> Callable: @Substitution( desc=desc, name1=name1, @@ -11010,13 +11134,15 @@ def stat_func( if level is not None: return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) return self._reduce( - f, name, axis=axis, skipna=skipna, numeric_only=numeric_only + func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only ) return set_function_name(stat_func, name, cls) -def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f): +def _make_stat_function_ddof( + cls, name: str, name1: str, name2: str, axis_descr: str, desc: str, func: Callable +) -> Callable: @Substitution(desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) @Appender(_num_ddof_doc) def stat_func( @@ -11032,7 +11158,7 @@ def stat_func( name, axis=axis, level=level, skipna=skipna, ddof=ddof ) return self._reduce( - f, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof + func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof ) return set_function_name(stat_func, name, cls) @@ -11040,17 +11166,15 @@ def stat_func( def _make_cum_function( cls, - name, - name1, - name2, - axis_descr, - desc, - accum_func, - accum_func_name, - mask_a, - mask_b, - examples, -): + name: str, + name1: str, + name2: str, + axis_descr: str, + desc: str, + accum_func: Callable, + accum_func_name: str, + examples: str, +) -> Callable: @Substitution( desc=desc, name1=name1, @@ -11070,72 +11194,35 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T - def na_accum_func(blk_values): - # We will be applying this function to block values - if blk_values.dtype.kind in ["m", "M"]: - # GH#30460, GH#29058 - # numpy 1.18 started sorting NaTs at the end instead of beginning, - # so we need to work around to maintain backwards-consistency. - orig_dtype = blk_values.dtype - - # We need to define mask before masking NaTs - mask = isna(blk_values) - - if accum_func == np.minimum.accumulate: - # Note: the accum_func comparison fails as an "is" comparison - y = blk_values.view("i8") - y[mask] = np.iinfo(np.int64).max - changed = True - else: - y = blk_values - changed = False - - result = accum_func(y.view("i8"), axis) - if skipna: - np.putmask(result, mask, iNaT) - elif accum_func == np.minimum.accumulate: - # Restore NaTs that we masked previously - nz = (~np.asarray(mask)).nonzero()[0] - if len(nz): - # everything up to the first non-na entry stays NaT - result[: nz[0]] = iNaT - - if changed: - # restore NaT elements - y[mask] = iNaT # TODO: could try/finally for this? 
- - if isinstance(blk_values, np.ndarray): - result = result.view(orig_dtype) - else: - # DatetimeArray - result = type(blk_values)._from_sequence(result, dtype=orig_dtype) - - elif skipna and not issubclass( - blk_values.dtype.type, (np.integer, np.bool_) - ): - vals = blk_values.copy().T - mask = isna(vals) - np.putmask(vals, mask, mask_a) - result = accum_func(vals, axis) - np.putmask(result, mask, mask_b) - else: - result = accum_func(blk_values.T, axis) + def block_accum_func(blk_values): + values = blk_values.T if hasattr(blk_values, "T") else blk_values - # transpose back for ndarray, not for EA - return result.T if hasattr(result, "T") else result + result = nanops.na_accum_func(values, accum_func, skipna=skipna) - result = self._data.apply(na_accum_func) + result = result.T if hasattr(result, "T") else result + return result + + result = self._mgr.apply(block_accum_func) d = self._construct_axes_dict() d["copy"] = False - return self._constructor(result, **d).__finalize__(self) + return self._constructor(result, **d).__finalize__(self, method=name) return set_function_name(cum_func, name, cls) def _make_logical_function( - cls, name, name1, name2, axis_descr, desc, f, see_also, examples, empty_value -): + cls, + name: str, + name1: str, + name2: str, + axis_descr: str, + desc: str, + func: Callable, + see_also: str, + examples: str, + empty_value: bool, +) -> Callable: @Substitution( desc=desc, name1=name1, @@ -11155,8 +11242,8 @@ def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs ) return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) return self._reduce( - f, - name, + func, + name=name, axis=axis, skipna=skipna, numeric_only=bool_only, diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 700d8d503d086..363286704ba95 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -98,6 +98,7 @@ def _gotitem(self, key, ndim, subset=None): [ "all", "any", + "corrwith", "count", "first", "idxmax", @@ -132,7 +133,6 @@ def _gotitem(self, key, ndim, subset=None): [ "backfill", "bfill", - "corrwith", "cumcount", "cummax", "cummin", diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index c71ebee397bbd..db734bb2f0c07 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -4,7 +4,7 @@ from pandas.core.arrays.categorical import ( Categorical, CategoricalDtype, - _recode_for_categories, + recode_for_categories, ) @@ -51,7 +51,7 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool): # we recode according to the uniques categories = c.categories.take(take_codes) - codes = _recode_for_categories(c.codes, c.categories, categories) + codes = recode_for_categories(c.codes, c.categories, categories) # return a new categorical that maps our new codes # and categories diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1bb512aee39e2..13938c41a0f6b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -29,11 +29,13 @@ import numpy as np -from pandas._libs import Timestamp, lib +from pandas._libs import lib from pandas._typing import FrameOrSeries -from pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( + maybe_cast_result, + maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, maybe_downcast_to_dtype, @@ -49,7 +51,7 @@ is_scalar, 
needs_i8_conversion, ) -from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna +from pandas.core.dtypes.missing import isna, notna from pandas.core.aggregation import ( is_multi_agg_with_relabel, @@ -149,7 +151,7 @@ def pinner(cls): @pin_whitelisted_properties(Series, base.series_apply_whitelist) -class SeriesGroupBy(GroupBy): +class SeriesGroupBy(GroupBy[Series]): _apply_whitelist = base.series_apply_whitelist def _iterate_slices(self) -> Iterable[Series]: @@ -386,7 +388,7 @@ def _wrap_aggregated_output( result = self._wrap_series_output( output=output, index=self.grouper.result_index ) - return self._reindex_output(result)._convert(datetime=True) + return self._reindex_output(result) def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] @@ -526,7 +528,7 @@ def _transform_fast(self, result, func_nm: str) -> Series: cast = self._transform_should_cast(func_nm) out = algorithms.take_1d(result._values, ids) if cast: - out = self._try_cast(out, self.obj) + out = maybe_cast_result(out, self.obj, how=func_nm) return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): @@ -572,8 +574,8 @@ def true_and_notna(x, *args, **kwargs) -> bool: indices = [ self._get_index(name) for name, group in self if true_and_notna(group) ] - except (ValueError, TypeError): - raise TypeError("the filter must return a boolean result") + except (ValueError, TypeError) as err: + raise TypeError("the filter must return a boolean result") from err filtered = self._apply_filter(indices, dropna) return filtered @@ -589,7 +591,7 @@ def nunique(self, dropna: bool = True) -> Series: """ ids, _, _ = self.grouper.group_info - val = self.obj._internal_get_values() + val = self.obj._values codes, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((codes, ids)) @@ -631,7 +633,7 @@ def nunique(self, dropna: bool = True) -> Series: result = Series(res, index=ri, name=self._selection_name) return self._reindex_output(result, fill_value=0) - @Appender(Series.describe.__doc__) + @doc(Series.describe) def describe(self, **kwargs): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: @@ -657,7 +659,7 @@ def value_counts( ) ids, _, _ = self.grouper.group_info - val = self.obj._internal_get_values() + val = self.obj._values # groupby removes null keys from groupings mask = ids != -1 @@ -774,7 +776,7 @@ def count(self) -> Series: Count of values within each group. """ ids, _, ngroups = self.grouper.group_info - val = self.obj._internal_get_values() + val = self.obj._values mask = (ids != -1) & ~isna(val) ids = ensure_platform_int(ids) @@ -813,7 +815,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): @pin_whitelisted_properties(DataFrame, base.dataframe_apply_whitelist) -class DataFrameGroupBy(GroupBy): +class DataFrameGroupBy(GroupBy[DataFrame]): _apply_whitelist = base.dataframe_apply_whitelist @@ -831,10 +833,13 @@ class DataFrameGroupBy(GroupBy): """ Examples -------- - - >>> df = pd.DataFrame({'A': [1, 1, 2, 2], - ... 'B': [1, 2, 3, 4], - ... 'C': np.random.randn(4)}) + >>> df = pd.DataFrame( + ... { + ... "A": [1, 1, 2, 2], + ... "B": [1, 2, 3, 4], + ... "C": [0.362838, 0.227877, 1.267767, -0.562860], + ... } + ... ) >>> df A B C @@ -874,7 +879,7 @@ class DataFrameGroupBy(GroupBy): B C min max sum A - 1 1 2 0.590716 + 1 1 2 0.590715 2 3 4 0.704907 To control the output names with different aggregations per column, @@ -885,8 +890,9 @@ class DataFrameGroupBy(GroupBy): ... 
c_sum=pd.NamedAgg(column="C", aggfunc="sum")) b_min c_sum A - 1 1 -1.956929 - 2 3 -0.322183 + 1 1 0.590715 + 2 3 0.704907 + - The keywords are the *output* column names - The values are tuples whose first element is the column to select @@ -955,9 +961,11 @@ def aggregate(self, func=None, *args, **kwargs): raise result = self._aggregate_frame(func) else: - result.columns = Index( - result.columns.levels[0], name=self._selected_obj.columns.name - ) + # select everything except for the last level, which is the one + # containing the name of the function(s), see GH 32040 + result.columns = result.columns.rename( + [self._selected_obj.columns.name] * result.columns.nlevels + ).droplevel(-1) if not self.as_index: self._insert_inaxis_grouper_inplace(result) @@ -1053,7 +1061,7 @@ def _cython_agg_blocks( else: result = cast(DataFrame, result) # unwrap DataFrame to get array - if len(result._data.blocks) != 1: + if len(result._mgr.blocks) != 1: # We've split an object block! Everything we've assumed # about a single block input returning a single block output # is a lie. To keep the code-path for the typical non-split case @@ -1062,16 +1070,18 @@ def _cython_agg_blocks( split_frames.append(result) continue - assert len(result._data.blocks) == 1 - result = result._data.blocks[0].values + assert len(result._mgr.blocks) == 1 + result = result._mgr.blocks[0].values if isinstance(result, np.ndarray) and result.ndim == 1: result = result.reshape(1, -1) assert not isinstance(result, DataFrame) if result is not no_result: - # see if we can cast the block back to the original dtype - result = maybe_downcast_numeric(result, block.dtype) + # see if we can cast the block to the desired dtype + # this may not be the original dtype + dtype = maybe_cast_result_dtype(block.dtype, how) + result = maybe_downcast_numeric(result, dtype) if block.is_extension and isinstance(result, np.ndarray): # e.g. block.values was an IntegerArray @@ -1083,7 +1093,7 @@ def _cython_agg_blocks( result = type(block.values)._from_sequence( result.ravel(), dtype=block.values.dtype ) - except ValueError: + except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) @@ -1101,7 +1111,7 @@ def _cython_agg_blocks( assert len(locs) == result.shape[1] for i, loc in enumerate(locs): new_items.append(np.array([loc], dtype=locs.dtype)) - agg_blocks.append(result.iloc[:, [i]]._data.blocks[0]) + agg_blocks.append(result.iloc[:, [i]]._mgr.blocks[0]) # reset the locs in the blocks to correspond to our # current ordering @@ -1173,7 +1183,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: else: if cast: - result[item] = self._try_cast(result[item], data) + result[item] = maybe_cast_result(result[item], data) result_columns = obj.columns if cannot_agg: @@ -1187,20 +1197,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): key_names = self.grouper.names - # GH12824. - def first_not_none(values): - try: - return next(com.not_none(*values)) - except StopIteration: - return None - - v = first_not_none(values) + # GH12824 + first_not_none = next(com.not_none(*values), None) - if v is None: + if first_not_none is None: # GH9684. If all values are None, then this will throw an error. # We'd prefer it return an empty dataframe. 
return DataFrame() - elif isinstance(v, DataFrame): + elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: if len(self.grouper.groupings) > 1: @@ -1217,6 +1221,9 @@ def first_not_none(values): # reorder the values values = [values[i] for i in indexer] + + # update due to the potential reorder + first_not_none = next(com.not_none(*values), None) else: key_index = Index(keys, name=key_names[0]) @@ -1226,20 +1233,19 @@ def first_not_none(values): key_index = None # make Nones an empty object - v = first_not_none(values) - if v is None: + if first_not_none is None: return DataFrame() - elif isinstance(v, NDFrame): + elif isinstance(first_not_none, NDFrame): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object - kwargs = v._construct_axes_dict() - if v._constructor is Series: + kwargs = first_not_none._construct_axes_dict() + if first_not_none._constructor is Series: backup = create_series_with_explicit_dtype( **kwargs, dtype_if_empty=object ) else: - backup = v._constructor(**kwargs) + backup = first_not_none._constructor(**kwargs) values = [x if (x is not None) else backup for x in values] @@ -1340,14 +1346,10 @@ def first_not_none(values): # values are not series or array-like but scalars else: - # only coerce dates if we find at least 1 datetime - should_coerce = any(isinstance(x, Timestamp) for x in values) # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns - return Series(values, index=key_index)._convert( - datetime=True, coerce=should_coerce - ) + return Series(values, index=key_index) else: # Handle cases like BinGrouper @@ -1371,9 +1373,9 @@ def _transform_general(self, func, *args, **kwargs): path, res = self._choose_path(fast_path, slow_path, group) except TypeError: return self._transform_item_by_item(obj, fast_path) - except ValueError: + except ValueError as err: msg = "transform must return a scalar value for each group" - raise ValueError(msg) + raise ValueError(msg) from err else: res = path(group) @@ -1456,9 +1458,9 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: for i, _ in enumerate(result.columns): res = algorithms.take_1d(result.iloc[:, i].values, ids) # TODO: we have no test cases that get here with EA dtypes; - # try_cast may not be needed if EAs never get here + # maybe_cast_result may not be needed if EAs never get here if cast: - res = self._try_cast(res, obj.iloc[:, i]) + res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm) output.append(res) return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) @@ -1645,9 +1647,9 @@ def _wrap_frame_output(self, result, obj) -> DataFrame: def _get_data_to_aggregate(self) -> BlockManager: obj = self._obj_with_exclusions if self.axis == 1: - return obj.T._data + return obj.T._mgr else: - return obj._data + return obj._mgr def _insert_inaxis_grouper_inplace(self, result): # zip in reverse so we can always insert at loc 0 @@ -1697,7 +1699,7 @@ def _wrap_aggregated_output( if self.axis == 1: result = result.T - return self._reindex_output(result)._convert(datetime=True) + return self._reindex_output(result) def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] @@ -1772,10 +1774,8 @@ def count(self): ids, _, ngroups = self.grouper.group_info mask = ids != -1 - vals = ( - (mask & 
~_isna_ndarraylike(np.atleast_2d(blk.get_values()))) - for blk in data.blocks - ) + # TODO(2DEA): reshape would not be necessary with 2D EAs + vals = ((mask & ~isna(blk.values).reshape(blk.shape)) for blk in data.blocks) locs = (blk.mgr_locs for blk in data.blocks) counted = ( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f946f0e63a583..873f24b9685e3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -17,6 +17,7 @@ class providing the base-class of operations. Callable, Dict, FrozenSet, + Generic, Hashable, Iterable, List, @@ -24,6 +25,7 @@ class providing the base-class of operations. Optional, Tuple, Type, + TypeVar, Union, ) @@ -37,11 +39,12 @@ class providing the base-class of operations. from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._decorators import Appender, Substitution, cache_readonly, doc -from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( ensure_float, + is_bool_dtype, is_datetime64_dtype, is_extension_array_dtype, is_integer_dtype, @@ -53,7 +56,7 @@ class providing the base-class of operations. from pandas.core import nanops import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea +from pandas.core.arrays import Categorical, DatetimeArray from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -199,20 +202,20 @@ class providing the base-class of operations. functions that expect Series, DataFrames, GroupBy or Resampler objects. Instead of writing ->>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) +>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP You can write >>> (df.groupby('group') ... .pipe(f) ... .pipe(g, arg1=a) -... .pipe(h, arg2=b, arg3=c)) +... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP which is much more readable. 
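[Editor's note] The ``pipe`` docstring modified above marks its chained examples with ``# doctest: +SKIP`` because ``f``, ``g`` and ``h`` are placeholders. For readers who want something runnable, here is a hedged, self-contained sketch of the same chaining pattern; ``take_mean`` and ``add_offset`` are illustrative helpers, not pandas API:

```python
import pandas as pd

df = pd.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})

def take_mean(gb):
    # gb is the GroupBy object passed through .pipe
    return gb.mean()

def add_offset(result, offset=0):
    return result + offset

# Reads left to right, unlike add_offset(take_mean(df.groupby("group")), offset=10)
out = df.groupby("group").pipe(take_mean).pipe(add_offset, offset=10)
print(out)
```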
Parameters ---------- -func : callable or tuple of (callable, string) +func : callable or tuple of (callable, str) Function to apply to this %(klass)s object or, alternatively, a `(callable, data_keyword)` tuple where `data_keyword` is a string indicating the keyword of `callable` that expects the @@ -354,13 +357,13 @@ def _group_selection_context(groupby): ] -class _GroupBy(PandasObject, SelectionMixin): +class _GroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]): _group_selection = None _apply_whitelist: FrozenSet[str] = frozenset() def __init__( self, - obj: NDFrame, + obj: FrameOrSeries, keys: Optional[_KeysArgType] = None, axis: int = 0, level=None, @@ -482,13 +485,13 @@ def get_converter(s): try: # If the original grouper was a tuple return [self.indices[name] for name in names] - except KeyError: + except KeyError as err: # turns out it wasn't a tuple msg = ( "must supply a same-length tuple to get_group " "with multiple grouping keys" ) - raise ValueError(msg) + raise ValueError(msg) from err converters = [get_converter(s) for s in index_sample] names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) @@ -792,36 +795,6 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only: bool = False): - """ - Try to cast the result to our obj original type, - we may have roundtripped through object in the mean-time. - - If numeric_only is True, then only try to cast numerics - and not datetimelikes. - - """ - if obj.ndim > 1: - dtype = obj._values.dtype - else: - dtype = obj.dtype - - if not is_scalar(result): - if is_extension_array_dtype(dtype) and dtype.kind != "M": - # The function can return something of any type, so check - # if the type is compatible with the calling EA. - # datetime64tz is handled correctly in agg_series, - # so is excluded here. 
- - if len(result) and isinstance(result[0], dtype.type): - cls = dtype.construct_array_type() - result = try_cast_to_ea(cls, result, dtype=dtype) - - elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: - result = maybe_downcast_to_dtype(result, dtype) - - return result - def _transform_should_cast(self, func_nm: str) -> bool: """ Parameters @@ -852,7 +825,7 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): continue if self._transform_should_cast(how): - result = self._try_cast(result, obj) + result = maybe_cast_result(result, obj, how=how) key = base.OutputKey(label=name, position=idx) output[key] = result @@ -895,12 +868,12 @@ def _cython_agg_general( assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) - output[key] = self._try_cast(result_column, obj) + output[key] = maybe_cast_result(result_column, obj, how=how) idx += 1 else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj) + output[key] = maybe_cast_result(result, obj, how=how) idx += 1 if len(output) == 0: @@ -929,7 +902,7 @@ def _python_agg_general(self, func, *args, **kwargs): assert result is not None key = base.OutputKey(label=name, position=idx) - output[key] = self._try_cast(result, obj, numeric_only=True) + output[key] = maybe_cast_result(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) @@ -944,7 +917,7 @@ def _python_agg_general(self, func, *args, **kwargs): if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[key] = self._try_cast(values[mask], result) + output[key] = maybe_cast_result(values[mask], result) return self._wrap_aggregated_output(output) @@ -1026,7 +999,11 @@ def _apply_filter(self, indices, dropna): return filtered -class GroupBy(_GroupBy): +# To track operations that expand dimensions, like ohlc +OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame) + + +class GroupBy(_GroupBy[FrameOrSeries]): """ Class for grouping and aggregating relational data. @@ -1451,7 +1428,7 @@ def ohlc(self) -> DataFrame: """ return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc")) - @Appender(DataFrame.describe.__doc__) + @doc(DataFrame.describe) def describe(self, **kwargs): with _group_selection_context(self): result = self.apply(lambda x: x.describe(**kwargs)) @@ -1892,11 +1869,15 @@ def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: ) inference = None - if is_integer_dtype(vals): + if is_integer_dtype(vals.dtype): + if is_extension_array_dtype(vals.dtype): + vals = vals.to_numpy(dtype=float, na_value=np.nan) inference = np.int64 - elif is_datetime64_dtype(vals): + elif is_bool_dtype(vals.dtype) and is_extension_array_dtype(vals.dtype): + vals = vals.to_numpy(dtype=float, na_value=np.nan) + elif is_datetime64_dtype(vals.dtype): inference = "datetime64[ns]" - vals = vals.astype(np.float) + vals = np.asarray(vals).astype(np.float) return vals, inference @@ -2036,7 +2017,9 @@ def cumcount(self, ascending: bool = True): Essentially this is equivalent to - >>> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) + .. 
code-block:: python + + self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) Parameters ---------- @@ -2271,7 +2254,7 @@ def _get_cythonized_result( for idx, obj in enumerate(self._iterate_slices()): name = obj.name - values = obj._data._values + values = obj._values if aggregate: result_sz = ngroups @@ -2312,11 +2295,12 @@ def _get_cythonized_result( return self._wrap_transformed_output(output) @Substitution(name="groupby") - @Appender(_common_see_also) def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ Shift each group by periods observations. + If freq is passed, the index will be increased using the periods and the freq. + Parameters ---------- periods : int, default 1 @@ -2324,7 +2308,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): freq : str, optional Frequency string. axis : axis to shift, default 0 + Shift direction. fill_value : optional + The scalar value to use for newly introduced missing values. .. versionadded:: 0.24.0 @@ -2332,6 +2318,12 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): ------- Series or DataFrame Object shifted within each group. + + See Also + -------- + Index.shift : Shift values of Index. + tshift : Shift the time index, using the index’s frequency + if available. """ if freq is not None or axis != 0 or not isna(fill_value): return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) @@ -2442,8 +2434,8 @@ def tail(self, n=5): return self._selected_obj[mask] def _reindex_output( - self, output: FrameOrSeries, fill_value: Scalar = np.NaN - ) -> FrameOrSeries: + self, output: OutputFrameOrSeries, fill_value: Scalar = np.NaN + ) -> OutputFrameOrSeries: """ If we have categorical groupers, then we might want to make sure that we have a fully re-indexed output to the levels. This means expanding @@ -2531,7 +2523,7 @@ def _reindex_output( GroupBy._add_numeric_operations() -@Appender(GroupBy.__doc__) +@doc(GroupBy) def get_groupby( obj: NDFrame, by: Optional[_KeysArgType] = None, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 21e171f937de8..9bd098d1d49a3 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -79,16 +79,51 @@ class Grouper: -------- Syntactic sugar for ``df.groupby('A')`` - >>> df.groupby(Grouper(key='A')) - - Specify a resample operation on the column 'date' - - >>> df.groupby(Grouper(key='date', freq='60s')) - - Specify a resample operation on the level 'date' on the columns axis - with a frequency of 60s - - >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) + >>> df = pd.DataFrame( + ... { + ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"], + ... "Speed": [100, 5, 200, 300, 15], + ... } + ... ) + >>> df + Animal Speed + 0 Falcon 100 + 1 Parrot 5 + 2 Falcon 200 + 3 Falcon 300 + 4 Parrot 15 + >>> df.groupby(pd.Grouper(key="Animal")).mean() + Speed + Animal + Falcon 200 + Parrot 10 + + Specify a resample operation on the column 'Publish date' + + >>> df = pd.DataFrame( + ... { + ... "Publish date": [ + ... pd.Timestamp("2000-01-02"), + ... pd.Timestamp("2000-01-02"), + ... pd.Timestamp("2000-01-09"), + ... pd.Timestamp("2000-01-16") + ... ], + ... "ID": [0, 1, 2, 3], + ... "Price": [10, 20, 30, 40] + ... } + ... 
) + >>> df + Publish date ID Price + 0 2000-01-02 0 10 + 1 2000-01-02 1 20 + 2 2000-01-09 2 30 + 3 2000-01-16 3 40 + >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean() + ID Price + Publish date + 2000-01-02 0.5 15.0 + 2000-01-09 2.0 30.0 + 2000-01-16 3.0 40.0 """ _attributes: Tuple[str, ...] = ("key", "level", "freq", "axis", "sort") @@ -561,7 +596,8 @@ def get_grouper( # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: if not _is_label_like(key): - items = obj._data.items + # items -> .columns for DataFrame, .index for Series + items = obj.axes[-1] try: items.get_loc(key) except (KeyError, TypeError, InvalidIndexError): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7259268ac3f2b..8d535374a083f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -217,7 +217,7 @@ def indices(self): return self.groupings[0].indices else: codes_list = [ping.codes for ping in self.groupings] - keys = [com.values_from_object(ping.group_index) for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) @property @@ -525,9 +525,7 @@ def _cython_operation( np.empty(out_shape, dtype=out_dtype), fill_value=np.nan ) counts = np.zeros(self.ngroups, dtype=np.int64) - result = self._aggregate( - result, counts, values, codes, func, is_datetimelike, min_count - ) + result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = _maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan @@ -590,14 +588,7 @@ def transform(self, values, how: str, axis: int = 0, **kwargs): return self._cython_operation("transform", values, how, axis, **kwargs) def _aggregate( - self, - result, - counts, - values, - comp_ids, - agg_func, - is_datetimelike: bool, - min_count: int = -1, + self, result, counts, values, comp_ids, agg_func, min_count: int = -1, ): if agg_func is libgroupby.group_nth: # different signature from the others @@ -691,7 +682,7 @@ def _aggregate_series_pure_python(self, obj: Series, func): assert result is not None result = lib.maybe_convert_objects(result, try_float=0) - # TODO: try_cast back to EA? + # TODO: maybe_cast_to_extension_array? return result, counts diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 5e53b061dd1c8..3d0e3699264a8 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -11,6 +11,7 @@ is_array_like, is_bool_dtype, is_extension_array_dtype, + is_integer, is_integer_dtype, is_list_like, ) @@ -20,6 +21,34 @@ # Indexer Identification +def is_valid_positional_slice(slc: slice) -> bool: + """ + Check if a slice object can be interpreted as a positional indexer. + + Parameters + ---------- + slc : slice + + Returns + ------- + bool + + Notes + ----- + A valid positional slice may also be interpreted as a label-based slice + depending on the index being sliced. + """ + + def is_int_or_none(val): + return val is None or is_integer(val) + + return ( + is_int_or_none(slc.start) + and is_int_or_none(slc.stop) + and is_int_or_none(slc.step) + ) + + def is_list_like_indexer(key) -> bool: """ Check if we have a list-like indexer that is *not* a NamedTuple. @@ -36,18 +65,26 @@ def is_list_like_indexer(key) -> bool: return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple) -def is_scalar_indexer(indexer, arr_value) -> bool: +def is_scalar_indexer(indexer, ndim: int) -> bool: """ Return True if we are all scalar indexers. 
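[Editor's note] The ``is_scalar_indexer`` rewrite above changes the helper to take the target's ``ndim`` instead of the value being assigned. A rough sketch of how the new version behaves, based solely on the body shown in this hunk (``pandas.core.indexers`` is internal API, so treat this as illustrative only):

```python
from pandas.core.indexers import is_scalar_indexer  # internal helper; signature per this diff

# A tuple with one integer per axis counts as a scalar indexer.
print(is_scalar_indexer((0, 1), ndim=2))            # True
# Anything containing a slice, or a bare scalar, does not.
print(is_scalar_indexer((0, slice(None)), ndim=2))  # False
print(is_scalar_indexer(5, ndim=1))                 # False
```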
+ Parameters + ---------- + indexer : object + ndim : int + Number of dimensions in the object being indexed. + Returns ------- bool """ - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) + if isinstance(indexer, tuple): + if len(indexer) == ndim: + return all( + is_integer(x) or (isinstance(x, np.ndarray) and x.ndim == len(x) == 1) + for x in indexer + ) return False @@ -437,10 +474,10 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: elif is_integer_dtype(dtype): try: indexer = np.asarray(indexer, dtype=np.intp) - except ValueError: + except ValueError as err: raise ValueError( "Cannot index with an integer indexer containing NA values" - ) + ) from err else: raise IndexError("arrays used as indices must be of integer or boolean type") diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index db774a03c02f8..d44fed9e097e7 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -1,16 +1,17 @@ """ datetimelike delegation """ +from typing import TYPE_CHECKING + import numpy as np from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetime_arraylike, is_integer_dtype, is_list_like, - is_period_arraylike, + is_period_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCSeries @@ -21,9 +22,12 @@ from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex +if TYPE_CHECKING: + from pandas import Series # noqa:F401 + class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): - def __init__(self, data, orig): + def __init__(self, data: "Series", orig): if not isinstance(data, ABCSeries): raise TypeError( f"cannot convert an object of type {type(data)} to a datetimelike index" @@ -45,12 +49,8 @@ def _get_values(self): elif is_timedelta64_dtype(data.dtype): return TimedeltaIndex(data, copy=False, name=self.name) - else: - if is_period_arraylike(data): - # TODO: use to_period_array - return PeriodArray(data, copy=False) - if is_datetime_arraylike(data): - return DatetimeIndex(data, copy=False, name=self.name) + elif is_period_dtype(data): + return PeriodArray(data, copy=False) raise TypeError( f"cannot convert an object of type {type(data)} to a datetimelike index" @@ -129,15 +129,47 @@ class DatetimeProperties(Properties): Examples -------- - >>> s.dt.hour - >>> s.dt.second - >>> s.dt.quarter + >>> seconds_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="s")) + >>> seconds_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + dtype: datetime64[ns] + >>> seconds_series.dt.second + 0 0 + 1 1 + 2 2 + dtype: int64 + + >>> hours_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h")) + >>> hours_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 01:00:00 + 2 2000-01-01 02:00:00 + dtype: datetime64[ns] + >>> hours_series.dt.hour + 0 0 + 1 1 + 2 2 + dtype: int64 + + >>> quarters_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="q")) + >>> quarters_series + 0 2000-03-31 + 1 2000-06-30 + 2 2000-09-30 + dtype: datetime64[ns] + >>> quarters_series.dt.quarter + 0 1 + 1 2 + 2 3 + dtype: int64 Returns a Series indexed like the original Series. Raises TypeError if the Series does not contain datetimelike values. 
""" - def to_pydatetime(self): + def to_pydatetime(self) -> np.ndarray: """ Return the data as an array of native Python datetime objects. @@ -187,6 +219,38 @@ def to_pydatetime(self): def freq(self): return self._get_values().inferred_freq + @property + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day calculated according to + the ISO 8601 standard. + + .. versionadded:: 1.1.0 + + Returns + ------- + DataFrame + with columns year, week and day + + See Also + -------- + Timestamp.isocalendar + datetime.date.isocalendar + + Examples + -------- + >>> ser = pd.to_datetime(pd.Series(["2010-01-01", pd.NaT])) + >>> ser.dt.isocalendar + year week day + 0 2009 53 5 + 1 + >>> ser.dt.isocalendar.week + 0 53 + 1 + Name: week, dtype: UInt32 + """ + return self._get_values().isocalendar.set_index(self._parent.index) + @delegate_names( delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property" @@ -200,16 +264,27 @@ class TimedeltaProperties(Properties): """ Accessor object for datetimelike properties of the Series values. - Examples - -------- - >>> s.dt.hours - >>> s.dt.seconds - Returns a Series indexed like the original Series. Raises TypeError if the Series does not contain datetimelike values. + + Examples + -------- + >>> seconds_series = pd.Series( + ... pd.timedelta_range(start="1 second", periods=3, freq="S") + ... ) + >>> seconds_series + 0 0 days 00:00:01 + 1 0 days 00:00:02 + 2 0 days 00:00:03 + dtype: timedelta64[ns] + >>> seconds_series.dt.seconds + 0 1 + 1 2 + 2 3 + dtype: int64 """ - def to_pytimedelta(self): + def to_pytimedelta(self) -> np.ndarray: """ Return an array of native `datetime.timedelta` objects. @@ -229,7 +304,7 @@ def to_pytimedelta(self): Examples -------- - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d')) + >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d")) >>> s 0 0 days 1 1 days @@ -239,9 +314,9 @@ def to_pytimedelta(self): dtype: timedelta64[ns] >>> s.dt.to_pytimedelta() - array([datetime.timedelta(0), datetime.timedelta(1), - datetime.timedelta(2), datetime.timedelta(3), - datetime.timedelta(4)], dtype=object) + array([datetime.timedelta(0), datetime.timedelta(days=1), + datetime.timedelta(days=2), datetime.timedelta(days=3), + datetime.timedelta(days=4)], dtype=object) """ return self._get_values().to_pytimedelta() @@ -258,11 +333,11 @@ def components(self): -------- >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s')) >>> s - 0 00:00:00 - 1 00:00:01 - 2 00:00:02 - 3 00:00:03 - 4 00:00:04 + 0 0 days 00:00:00 + 1 0 days 00:00:01 + 2 0 days 00:00:02 + 3 0 days 00:00:03 + 4 0 days 00:00:04 dtype: timedelta64[ns] >>> s.dt.components days hours minutes seconds milliseconds microseconds nanoseconds @@ -271,7 +346,7 @@ def components(self): 2 0 0 0 2 0 0 0 3 0 0 0 3 0 0 0 4 0 0 0 4 0 0 0 - """ # noqa: E501 + """ return self._get_values().components.set_index(self._parent.index) @property @@ -289,21 +364,67 @@ class PeriodProperties(Properties): """ Accessor object for datetimelike properties of the Series values. - Examples - -------- - >>> s.dt.hour - >>> s.dt.second - >>> s.dt.quarter - Returns a Series indexed like the original Series. Raises TypeError if the Series does not contain datetimelike values. + + Examples + -------- + >>> seconds_series = pd.Series( + ... pd.period_range( + ... start="2000-01-01 00:00:00", end="2000-01-01 00:00:03", freq="s" + ... ) + ... 
) + >>> seconds_series + 0 2000-01-01 00:00:00 + 1 2000-01-01 00:00:01 + 2 2000-01-01 00:00:02 + 3 2000-01-01 00:00:03 + dtype: period[S] + >>> seconds_series.dt.second + 0 0 + 1 1 + 2 2 + 3 3 + dtype: int64 + + >>> hours_series = pd.Series( + ... pd.period_range(start="2000-01-01 00:00", end="2000-01-01 03:00", freq="h") + ... ) + >>> hours_series + 0 2000-01-01 00:00 + 1 2000-01-01 01:00 + 2 2000-01-01 02:00 + 3 2000-01-01 03:00 + dtype: period[H] + >>> hours_series.dt.hour + 0 0 + 1 1 + 2 2 + 3 3 + dtype: int64 + + >>> quarters_series = pd.Series( + ... pd.period_range(start="2000-01-01", end="2000-12-31", freq="Q-DEC") + ... ) + >>> quarters_series + 0 2000Q1 + 1 2000Q2 + 2 2000Q3 + 3 2000Q4 + dtype: period[Q-DEC] + >>> quarters_series.dt.quarter + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 """ class CombinedDatetimelikeProperties( DatetimeProperties, TimedeltaProperties, PeriodProperties ): - def __new__(cls, data): + def __new__(cls, data: "Series"): # CombinedDatetimelikeProperties isn't really instantiated. Instead # we need to choose which parent (datetime or timedelta) is # appropriate. Since we're checking the dtypes anyway, we'll just @@ -321,7 +442,7 @@ def __new__(cls, data): orig.array, name=orig.name, copy=False, - dtype=orig.values.categories.dtype, + dtype=orig._values.categories.dtype, ) if is_datetime64_dtype(data.dtype): @@ -330,9 +451,7 @@ def __new__(cls, data): return DatetimeProperties(data, orig) elif is_timedelta64_dtype(data.dtype): return TimedeltaProperties(data, orig) - elif is_period_arraylike(data): + elif is_period_dtype(data): return PeriodProperties(data, orig) - elif is_datetime_arraylike(data): - return DatetimeProperties(data, orig) raise AttributeError("Can only use .dt accessor with datetimelike values") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c215fdb475ed8..530aaee24c7fb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,8 @@ +from copy import copy as copy_func from datetime import datetime import operator from textwrap import dedent -from typing import TYPE_CHECKING, Any, FrozenSet, Hashable, Optional, Union +from typing import TYPE_CHECKING, Any, FrozenSet, Hashable, Union import warnings import numpy as np @@ -15,7 +16,7 @@ from pandas._typing import Label from pandas.compat import set_function_name from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._decorators import Appender, Substitution, cache_readonly, doc from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.cast import ( @@ -29,10 +30,8 @@ ensure_platform_int, is_bool, is_bool_dtype, - is_categorical, is_categorical_dtype, is_datetime64_any_dtype, - is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, is_float, @@ -49,6 +48,7 @@ is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, + pandas_dtype, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( @@ -69,6 +69,7 @@ from pandas.core.accessor import CachedAccessor import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com from pandas.core.indexers import deprecate_ndim_indexing @@ -205,10 +206,14 @@ class Index(IndexOpsMixin, PandasObject): -------- RangeIndex : Index implementing a monotonic integer 
range. CategoricalIndex : Index of :class:`Categorical` s. - MultiIndex : A multi-level, or hierarchical, Index. + MultiIndex : A multi-level, or hierarchical Index. IntervalIndex : An Index of :class:`Interval` s. - DatetimeIndex, TimedeltaIndex, PeriodIndex - Int64Index, UInt64Index, Float64Index + DatetimeIndex : Index of datetime64 data. + TimedeltaIndex : Index of timedelta64 data. + PeriodIndex : Index of Period data. + Int64Index : A special case of :class:`Index` with purely integer labels. + UInt64Index : A special case of :class:`Index` with purely unsigned integer labels. + Float64Index : A special case of :class:`Index` with purely float labels. Notes ----- @@ -289,10 +294,19 @@ def __new__( name = maybe_extract_name(name, data, cls) + if dtype is not None: + dtype = pandas_dtype(dtype) + if "tz" in kwargs: + tz = kwargs.pop("tz") + validate_tz_from_dtype(dtype, tz) + dtype = tz_to_dtype(tz) + if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. data = data.to_numpy() + data_dtype = getattr(data, "dtype", None) + # range if isinstance(data, RangeIndex): return RangeIndex(start=data, copy=copy, dtype=dtype, name=name) @@ -300,43 +314,39 @@ def __new__( return RangeIndex.from_range(data, dtype=dtype, name=name) # categorical - elif is_categorical_dtype(data) or is_categorical_dtype(dtype): + elif is_categorical_dtype(data_dtype) or is_categorical_dtype(dtype): # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 from pandas.core.indexes.category import CategoricalIndex return _maybe_asobject(dtype, CategoricalIndex, data, copy, name, **kwargs) # interval - elif is_interval_dtype(data) or is_interval_dtype(dtype): + elif is_interval_dtype(data_dtype) or is_interval_dtype(dtype): # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 from pandas.core.indexes.interval import IntervalIndex return _maybe_asobject(dtype, IntervalIndex, data, copy, name, **kwargs) - elif ( - is_datetime64_any_dtype(data) - or is_datetime64_any_dtype(dtype) - or "tz" in kwargs - ): + elif is_datetime64_any_dtype(data_dtype) or is_datetime64_any_dtype(dtype): # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 from pandas import DatetimeIndex return _maybe_asobject(dtype, DatetimeIndex, data, copy, name, **kwargs) - elif is_timedelta64_dtype(data) or is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(data_dtype) or is_timedelta64_dtype(dtype): # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 from pandas import TimedeltaIndex return _maybe_asobject(dtype, TimedeltaIndex, data, copy, name, **kwargs) - elif is_period_dtype(data) or is_period_dtype(dtype): + elif is_period_dtype(data_dtype) or is_period_dtype(dtype): # Delay import for perf. 
https://github.com/pandas-dev/pandas/pull/31423 from pandas import PeriodIndex return _maybe_asobject(dtype, PeriodIndex, data, copy, name, **kwargs) # extension dtype - elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): + elif is_extension_array_dtype(data_dtype) or is_extension_array_dtype(dtype): if not (dtype is None or is_object_dtype(dtype)): # coerce to the provided dtype ea_cls = dtype.construct_array_type() @@ -397,10 +407,10 @@ def __new__( raise ValueError("Index data must be 1-dimensional") return cls._simple_new(subarr, name) - elif hasattr(data, "__array__"): - return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) elif data is None or is_scalar(data): raise cls._scalar_data_error(data) + elif hasattr(data, "__array__"): + return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) else: if tupleize_cols and is_list_like(data): # GH21470: convert iterable to list before determining if empty @@ -465,10 +475,10 @@ def _simple_new(cls, values, name: Label = None): # _index_data is a (temporary?) fix to ensure that the direct data # manipulation we do in `_libs/reduction.pyx` continues to work. # We need access to the actual ndarray, since we're messing with - # data buffers and strides. We don't re-use `_ndarray_values`, since - # we actually set this value too. + # data buffers and strides. result._index_data = values result._name = name + result._cache = {} return result._reset_identity() @@ -499,11 +509,13 @@ def _shallow_copy(self, values=None, name: Label = no_default): name : Label, defaults to self.name """ name = self.name if name is no_default else name - + cache = self._cache.copy() if values is None else {} if values is None: values = self.values - return self._simple_new(values, name=name) + result = self._simple_new(values, name=name) + result._cache = cache + return result def _shallow_copy_with_infer(self, values, **kwargs): """ @@ -529,11 +541,10 @@ def _shallow_copy_with_infer(self, values, **kwargs): return self._constructor(values, **attributes) except (TypeError, ValueError): pass - return Index(values, **attributes) - def _update_inplace(self, result, **kwargs): - # guard when called from IndexOpsMixin - raise TypeError("Index can't be updated inplace") + # Remove tz so Index will try non-DatetimeIndex inference + attributes.pop("tz", None) + return Index(values, **attributes) def is_(self, other) -> bool: """ @@ -568,10 +579,10 @@ def _cleanup(self): def _engine(self): # property, for now, slow to look up - # to avoid a reference cycle, bind `_ndarray_values` to a local variable, so + # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. 
- _ndarray_values = self._ndarray_values - return self._engine_type(lambda: _ndarray_values, len(self)) + target_values = self._get_engine_target() + return self._engine_type(lambda: target_values, len(self)) # -------------------------------------------------------------------- # Array-Like Methods @@ -620,7 +631,8 @@ def ravel(self, order="C"): -------- numpy.ndarray.ravel """ - return self._ndarray_values.ravel(order=order) + values = self._get_engine_target() + return values.ravel(order=order) def view(self, cls=None): @@ -666,12 +678,14 @@ def astype(self, dtype, copy=True): return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), dtype=dtype, copy=copy) + return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy) try: casted = self.values.astype(dtype, copy=copy) - except (TypeError, ValueError): - raise TypeError(f"Cannot cast {type(self).__name__} to dtype {dtype}") + except (TypeError, ValueError) as err: + raise TypeError( + f"Cannot cast {type(self).__name__} to dtype {dtype}" + ) from err return Index(casted, name=self.name, dtype=dtype) _index_shared_docs[ @@ -796,20 +810,24 @@ def repeat(self, repeats, axis=None): def copy(self, name=None, deep=False, dtype=None, names=None): """ - Make a copy of this object. Name and dtype sets those attributes on - the new object. + Make a copy of this object. + + Name and dtype sets those attributes on the new object. Parameters ---------- - name : Label + name : Label, optional + Set name for new object. deep : bool, default False dtype : numpy dtype or pandas type, optional + Set dtype for new object. names : list-like, optional Kept for compatibility with MultiIndex. Should not be used. Returns ------- Index + Index refer to new object which is a copy of this object. Notes ----- @@ -1831,7 +1849,7 @@ def is_object(self) -> bool: >>> idx = pd.Index(["Watermelon", "Orange", "Apple", ... "Watermelon"]).astype("category") - >>> idx.object() + >>> idx.is_object() False >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) @@ -1945,6 +1963,12 @@ def is_mixed(self) -> bool: >>> idx.is_mixed() False """ + warnings.warn( + "Index.is_mixed is deprecated and will be removed in a future version. " + "Check index.inferred_type directly instead.", + FutureWarning, + stacklevel=2, + ) return self.inferred_type in ["mixed"] def holds_integer(self) -> bool: @@ -2043,7 +2067,7 @@ def isna(self): >>> idx Float64Index([5.2, 6.0, nan], dtype='float64') >>> idx.isna() - array([False, False, True], dtype=bool) + array([False, False, True]) Empty strings are not considered NA values. None is considered an NA value. @@ -2052,7 +2076,7 @@ def isna(self): >>> idx Index(['black', '', 'red', None], dtype='object') >>> idx.isna() - array([False, False, False, True], dtype=bool) + array([False, False, False, True]) For datetimes, `NaT` (Not a Time) is considered as an NA value. @@ -2062,7 +2086,7 @@ def isna(self): DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], dtype='datetime64[ns]', freq=None) >>> idx.isna() - array([False, True, True, True], dtype=bool) + array([False, True, True, True]) """ return self._isnan @@ -2124,13 +2148,18 @@ def fillna(self, value=None, downcast=None): Scalar value to use to fill holes (e.g. 0). This value cannot be a list-likes. 
downcast : dict, default is None - a dict of item->dtype of what to downcast if possible, + A dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). Returns ------- - filled : Index + Index + + See Also + -------- + DataFrame.fillna : Fill NaN values of a DataFrame. + Series.fillna : Fill NaN Values of a Series. """ self._assert_can_do_op(value) if self.hasnans: @@ -2153,7 +2182,7 @@ def dropna(self, how="any"): Returns ------- - valid : Index + Index """ if how not in ("any", "all"): raise ValueError(f"invalid how option: {how}") @@ -2519,14 +2548,8 @@ def _union(self, other, sort): return other._get_reconciled_name_object(self) # TODO(EA): setops-refactor, clean all this up - if is_datetime64tz_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_datetime64tz_dtype(other): - rvals = other._ndarray_values - else: - rvals = other._values + lvals = self._values + rvals = other._values if sort is None and self.is_monotonic and other.is_monotonic: try: @@ -2827,8 +2850,6 @@ def get_loc(self, key, method=None, tolerance=None): the index at the matching location most satisfy the equation ``abs(index[loc] - key) <= tolerance``. - .. versionadded:: 0.21.0 (list-like tolerance) - Returns ------- loc : int if unique index, slice if monotonic index, else mask @@ -2845,7 +2866,7 @@ def get_loc(self, key, method=None, tolerance=None): >>> non_monotonic_index = pd.Index(list('abcb')) >>> non_monotonic_index.get_loc('b') - array([False, True, False, True], dtype=bool) + array([False, True, False, True]) """ if method is None: if tolerance is not None: @@ -2856,8 +2877,8 @@ def get_loc(self, key, method=None, tolerance=None): casted_key = self._maybe_cast_indexer(key) try: return self._engine.get_loc(casted_key) - except KeyError: - raise KeyError(key) + except KeyError as err: + raise KeyError(key) from err if tolerance is not None: tolerance = self._convert_tolerance(tolerance, np.asarray(key)) @@ -2900,8 +2921,6 @@ def get_loc(self, key, method=None, tolerance=None): the same size as the index and its dtype must exactly match the index's type. - .. 
versionadded:: 0.21.0 (list-like tolerance) - Returns ------- indexer : ndarray of int @@ -2968,7 +2987,7 @@ def get_indexer( "backfill or nearest reindexing" ) - indexer = self._engine.get_indexer(target._ndarray_values) + indexer = self._engine.get_indexer(target._get_engine_target()) return ensure_platform_int(indexer) @@ -2982,19 +3001,20 @@ def _convert_tolerance(self, tolerance, target): def _get_fill_indexer( self, target: "Index", method: str_t, limit=None, tolerance=None ) -> np.ndarray: + + target_values = target._get_engine_target() + if self.is_monotonic_increasing and target.is_monotonic_increasing: engine_method = ( self._engine.get_pad_indexer if method == "pad" else self._engine.get_backfill_indexer ) - indexer = engine_method(target._ndarray_values, limit) + indexer = engine_method(target_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None: - indexer = self._filter_indexer_tolerance( - target._ndarray_values, indexer, tolerance - ) + indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer def _get_fill_indexer_searchsorted( @@ -3039,8 +3059,9 @@ def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: left_indexer = self.get_indexer(target, "pad", limit=limit) right_indexer = self.get_indexer(target, "backfill", limit=limit) - left_distances = np.abs(self[left_indexer] - target) - right_distances = np.abs(self[right_indexer] - target) + target_values = target._values + left_distances = np.abs(self._values[left_indexer] - target_values) + right_distances = np.abs(self._values[right_indexer] - target_values) op = operator.lt if self.is_monotonic_increasing else operator.le indexer = np.where( @@ -3049,13 +3070,16 @@ def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: right_indexer, ) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target, indexer, tolerance) + indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer def _filter_indexer_tolerance( - self, target: "Index", indexer: np.ndarray, tolerance + self, + target: Union["Index", np.ndarray, ExtensionArray], + indexer: np.ndarray, + tolerance, ) -> np.ndarray: - distance = abs(self.values[indexer] - target) + distance = abs(self._values[indexer] - target) indexer = np.where(distance <= tolerance, indexer, -1) return indexer @@ -3072,48 +3096,6 @@ def _get_partial_string_timestamp_match_key(self, key): # GH#10331 return key - def _convert_scalar_indexer(self, key, kind: str_t): - """ - Convert a scalar indexer. - - Parameters - ---------- - key : label of the slice bound - kind : {'loc', 'getitem'} - """ - assert kind in ["loc", "getitem"] - - if len(self) and not isinstance(self, ABCMultiIndex): - - # we can raise here if we are definitive that this - # is positional indexing (eg. 
.loc on with a float) - # or label indexing if we are using a type able - # to be represented in the index - - if kind == "getitem" and is_float(key): - if not self.is_floating(): - self._invalid_indexer("label", key) - - elif kind == "loc" and is_float(key): - - # we want to raise KeyError on string/mixed here - # technically we *could* raise a TypeError - # on anything but mixed though - if self.inferred_type not in [ - "floating", - "mixed-integer-float", - "integer-na", - "string", - "mixed", - ]: - self._invalid_indexer("label", key) - - elif kind == "loc" and is_integer(key): - if not (is_integer_dtype(self.dtype) or is_object_dtype(self.dtype)): - self._invalid_indexer("label", key) - - return key - def _validate_positional_slice(self, key: slice): """ For positional indexing, a slice must have either int or None @@ -3163,7 +3145,7 @@ def is_int(v): # convert the slice to an indexer here # if we are mixed and have integers - if is_positional and self.is_mixed(): + if is_positional: try: # Validate start & stop if start is not None: @@ -3175,8 +3157,18 @@ def is_int(v): pass if com.is_null_slice(key): + # It doesn't matter if we are positional or label based indexer = key elif is_positional: + if kind == "loc": + # GH#16121, GH#24612, GH#31810 + warnings.warn( + "Slicing a positional slice with .loc is not supported, " + "and will raise TypeError in a future version. " + "Use .loc with labels or .iloc with positions instead.", + FutureWarning, + stacklevel=6, + ) indexer = key else: indexer = self.slice_indexer(start, stop, step, kind=kind) @@ -3301,16 +3293,14 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): preserve_names = not hasattr(target, "name") # GH7774: preserve dtype/tz if target is empty and not an Index. 
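For orientation, a doctest-style sketch of the user-facing effect of the FutureWarning added above (the Series and its values are illustrative and not part of the patch):

>>> ser = pd.Series([1, 2, 3, 4], index=list("abcd"))
>>> # ser.loc[0:2] now emits the FutureWarning quoted above and will raise in a future version
>>> ser.iloc[0:2]  # positional slicing should go through .iloc instead
a    1
b    2
dtype: int64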
- target = _ensure_has_len(target) # target may be an iterator + target = ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: - attrs = self._get_attributes_dict() - attrs.pop("freq", None) # don't preserve freq if isinstance(self, ABCRangeIndex): values = range(0) else: values = self._data[:0] # appropriately-dtyped empty array - target = self._simple_new(values, **attrs) + target = self._simple_new(values, name=self.name) else: target = ensure_index(target) @@ -3423,6 +3413,7 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) ------- join_index, (left_indexer, right_indexer) """ + other = ensure_index(other) self_is_mi = isinstance(self, ABCMultiIndex) other_is_mi = isinstance(other, ABCMultiIndex) @@ -3442,8 +3433,6 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) other, level, how=how, return_indexers=return_indexers ) - other = ensure_index(other) - if len(other) == 0 and how in ("left", "outer"): join_index = self._shallow_copy() if return_indexers: @@ -3605,16 +3594,26 @@ def _join_multi(self, other, how, return_indexers=True): def _join_non_unique(self, other, how="left", return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers + # We only get here if dtypes match + assert self.dtype == other.dtype + + if is_extension_array_dtype(self.dtype): + lvalues = self._data._values_for_argsort() + rvalues = other._data._values_for_argsort() + else: + lvalues = self._values + rvalues = other._values + left_idx, right_idx = _get_join_indexers( - [self._ndarray_values], [other._ndarray_values], how=how, sort=True + [lvalues], [rvalues], how=how, sort=True ) left_idx = ensure_platform_int(left_idx) right_idx = ensure_platform_int(right_idx) - join_index = np.asarray(self._ndarray_values.take(left_idx)) + join_index = np.asarray(lvalues.take(left_idx)) mask = left_idx == -1 - np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) + np.putmask(join_index, mask, rvalues.take(right_idx)) join_index = self._wrap_joined_index(join_index, other) @@ -3765,6 +3764,9 @@ def _get_leaf_sorter(labels): return join_index def _join_monotonic(self, other, how="left", return_indexers=False): + # We only get here with matching dtypes + assert other.dtype == self.dtype + if self.equals(other): ret_index = other if how == "right" else self if return_indexers: @@ -3772,8 +3774,12 @@ def _join_monotonic(self, other, how="left", return_indexers=False): else: return ret_index - sv = self._ndarray_values - ov = other._ndarray_values + if is_extension_array_dtype(self.dtype): + sv = self._data._values_for_argsort() + ov = other._data._values_for_argsort() + else: + sv = self._values + ov = other._values if self.is_unique and other.is_unique: # We can perform much better than the general case @@ -3839,7 +3845,7 @@ def values(self) -> np.ndarray: return self._data.view(np.ndarray) @cache_readonly - @Appender(IndexOpsMixin.array.__doc__) # type: ignore + @doc(IndexOpsMixin.array) def array(self) -> ExtensionArray: array = self._data if isinstance(array, np.ndarray): @@ -3853,77 +3859,34 @@ def _values(self) -> Union[ExtensionArray, np.ndarray]: """ The best array representation. - This is an ndarray or ExtensionArray. This differs from - ``_ndarray_values``, which always returns an ndarray. + This is an ndarray or ExtensionArray. 
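A quick illustration of the ``values`` / ``_values`` distinction summarized in the table below, for the tz-aware case (a sketch; the outputs assume the array types listed in that table):

>>> dti = pd.date_range("2020-01-01", periods=2, tz="UTC")
>>> type(dti.values)    # public .values stays a plain datetime64[ns] ndarray
<class 'numpy.ndarray'>
>>> type(dti._values)   # ._values keeps the DatetimeArray
<class 'pandas.core.arrays.datetimes.DatetimeArray'>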
- Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index`` (except for datetime64[ns], which returns - a DatetimeArray for _values on the Index, but ndarray[M8ns] on the - Series). + ``_values`` are consistent between``Series`` and ``Index``. It may differ from the public '.values' method. - index | values | _values | _ndarray_values | - ----------------- | --------------- | ------------- | --------------- | - Index | ndarray | ndarray | ndarray | - CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | - PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | - IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | + index | values | _values | + ----------------- | --------------- | ------------- | + Index | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | + DatetimeIndex | ndarray[M8ns] | DatetimeArray | + DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | + PeriodIndex | ndarray[object] | PeriodArray | + IntervalIndex | IntervalArray | IntervalArray | See Also -------- values - _ndarray_values """ return self._data - def _internal_get_values(self) -> np.ndarray: + def _get_engine_target(self) -> np.ndarray: """ - Return `Index` data as an `numpy.ndarray`. - - Returns - ------- - numpy.ndarray - A one-dimensional numpy array of the `Index` values. - - See Also - -------- - Index.values : The attribute that _internal_get_values wraps. - - Examples - -------- - Getting the `Index` values of a `DataFrame`: - - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) - >>> df - A B C - a 1 2 3 - b 4 5 6 - c 7 8 9 - >>> df.index._internal_get_values() - array(['a', 'b', 'c'], dtype=object) - - Standalone `Index` values: - - >>> idx = pd.Index(['1', '2', '3']) - >>> idx._internal_get_values() - array(['1', '2', '3'], dtype=object) - - `MultiIndex` arrays also have only one dimension: - - >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], - ... names=('number', 'letter')) - >>> midx._internal_get_values() - array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) - >>> midx._internal_get_values().ndim - 1 + Get the ndarray that we can pass to the IndexEngine constructor. """ - return self.values + return self._values - @Appender(IndexOpsMixin.memory_usage.__doc__) + @doc(IndexOpsMixin.memory_usage) def memory_usage(self, deep: bool = False) -> int: result = super().memory_usage(deep=deep) @@ -3933,18 +3896,35 @@ def memory_usage(self, deep: bool = False) -> int: def where(self, cond, other=None): """ - Return an Index of same shape as self and whose corresponding - entries are from self where cond is True and otherwise are from - other. + Replace values where the condition is False. + + The replacement is taken from other. Parameters ---------- cond : bool array-like with the same length as self - other : scalar, or array-like + Condition to select the values on. + other : scalar, or array-like, default None + Replacement if the condition is False. Returns ------- - Index + pandas.Index + A copy of self with values replaced from other + where the condition is False. + + See Also + -------- + Series.where : Same method for Series. + DataFrame.where : Same method for DataFrame. 
+ + Examples + -------- + >>> idx = pd.Index(['car', 'bike', 'train', 'tractor']) + >>> idx + Index(['car', 'bike', 'train', 'tractor'], dtype='object') + >>> idx.where(idx.isin(['car', 'train']), 'other') + Index(['car', 'other', 'train', 'other'], dtype='object') """ if other is None: other = self._na_value @@ -4115,7 +4095,6 @@ def __getitem__(self, key): if com.is_bool_indexer(key): key = np.asarray(key, dtype=bool) - key = com.values_from_object(key) result = getitem(key) if not is_scalar(result): if np.ndim(result) > 1: @@ -4213,6 +4192,9 @@ def putmask(self, mask, value): values = self.values.copy() try: np.putmask(values, mask, self._convert_for_op(value)) + if is_period_dtype(self.dtype): + # .values cast to object, so we need to cast back + values = type(self)(values)._data return self._shallow_copy(values) except (ValueError, TypeError) as err: if is_object_dtype(self): @@ -4221,15 +4203,64 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def equals(self, other) -> bool: + def equals(self, other: Any) -> bool: """ - Determine if two Index objects contain the same elements. + Determine if two Index object are equal. + + The things that are being compared are: + + * The elements inside the Index object. + * The order of the elements inside the Index object. + + Parameters + ---------- + other : Any + The other object to compare against. Returns ------- bool - True if "other" is an Index and it has the same elements as calling - index; False otherwise. + True if "other" is an Index and it has the same elements and order + as the calling index; False otherwise. + + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3]) + >>> idx1 + Int64Index([1, 2, 3], dtype='int64') + >>> idx1.equals(pd.Index([1, 2, 3])) + True + + The elements inside are compared + + >>> idx2 = pd.Index(["1", "2", "3"]) + >>> idx2 + Index(['1', '2', '3'], dtype='object') + + >>> idx1.equals(idx2) + False + + The order is compared + + >>> ascending_idx = pd.Index([1, 2, 3]) + >>> ascending_idx + Int64Index([1, 2, 3], dtype='int64') + >>> descending_idx = pd.Index([3, 2, 1]) + >>> descending_idx + Int64Index([3, 2, 1], dtype='int64') + >>> ascending_idx.equals(descending_idx) + False + + The dtype is *not* compared + + >>> int64_idx = pd.Int64Index([1, 2, 3]) + >>> int64_idx + Int64Index([1, 2, 3], dtype='int64') + >>> uint64_idx = pd.UInt64Index([1, 2, 3]) + >>> uint64_idx + UInt64Index([1, 2, 3], dtype='uint64') + >>> int64_idx.equals(uint64_idx) + True """ if self.is_(other): return True @@ -4237,19 +4268,19 @@ def equals(self, other) -> bool: if not isinstance(other, Index): return False - if is_object_dtype(self) and not is_object_dtype(other): + if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): # if other is not object, use other's logic for coercion return other.equals(self) if isinstance(other, ABCMultiIndex): # d-level MultiIndex can equal d-tuple Index - if not is_object_dtype(self.dtype): - if self.nlevels != other.nlevels: - return False + return other.equals(self) - return array_equivalent( - com.values_from_object(self), com.values_from_object(other) - ) + if is_extension_array_dtype(other.dtype): + # All EA-backed Index subclasses override equals + return other.equals(self) + + return array_equivalent(self._values, other._values) def identical(self, other) -> bool: """ @@ -4544,10 +4575,7 @@ def get_value(self, series: "Series", key): ------- scalar or Series """ - if not is_scalar(key): - # if key is not a scalar, directly 
raise an error (the code below - # would convert to numpy arrays and raise later any way) - GH29926 - raise InvalidIndexError(key) + self._check_indexing_error(key) try: # GH 20882, 21257 @@ -4568,6 +4596,12 @@ def get_value(self, series: "Series", key): return self._get_values_for_loc(series, loc, key) + def _check_indexing_error(self, key): + if not is_scalar(key): + # if key is not a scalar, directly raise an error (the code below + # would convert to numpy arrays and raise later any way) - GH29926 + raise InvalidIndexError(key) + def _should_fallback_to_positional(self) -> bool: """ If an integer key is not found, should we fall back to positional indexing? @@ -4641,12 +4675,10 @@ def get_indexer_non_unique(self, target): if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) - if is_categorical(target): + if is_categorical_dtype(target.dtype): tgt_values = np.asarray(target) - elif self.is_all_dates and target.is_all_dates: # GH 30399 - tgt_values = target.asi8 else: - tgt_values = target._ndarray_values + tgt_values = target._get_engine_target() indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), missing @@ -4746,6 +4778,27 @@ def map(self, mapper, na_action=None): return Index(new_values, **attributes) + # TODO: De-duplicate with map, xref GH#32349 + def _transform_index(self, func, level=None) -> "Index": + """ + Apply function to all values found in index. + + This includes transforming multiindex entries separately. + Only apply function to one level of the MultiIndex if level is specified. + """ + if isinstance(self, ABCMultiIndex): + if level is not None: + items = [ + tuple(func(y) if i == level else y for i, y in enumerate(x)) + for x in self + ] + else: + items = [tuple(func(y) for y in x) for x in self] + return type(self).from_tuples(items, names=self.names) + else: + items = [func(x) for x in self] + return Index(items, name=self.name, tupleize_cols=False) + def isin(self, values, level=None): """ Return a boolean array where the index values are in `values`. @@ -4791,6 +4844,7 @@ def isin(self, values, level=None): Int64Index([1, 2, 3], dtype='int64') Check whether each index value in a list of values. + >>> idx.isin([1, 4]) array([ True, False, False]) @@ -4798,8 +4852,9 @@ def isin(self, values, level=None): ... ['red', 'blue', 'green']], ... names=('number', 'color')) >>> midx - MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']], - codes=[[0, 1, 2], [2, 0, 1]], + MultiIndex([(1, 'red'), + (2, 'blue'), + (3, 'green')], names=['number', 'color']) Check whether the strings in the 'color' level of the MultiIndex @@ -4867,11 +4922,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): >>> idx = pd.Index(list('abcd')) >>> idx.slice_indexer(start='b', end='c') - slice(1, 3) + slice(1, 3, None) >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')]) >>> idx.slice_indexer(start='b', end=('c', 'g')) - slice(1, 3) + slice(1, 3, None) """ start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind) @@ -5157,9 +5212,11 @@ def insert(self, loc: int, item): ------- new_index : Index """ - _self = np.asarray(self) - item = self._coerce_scalar_to_index(item)._ndarray_values - idx = np.concatenate((_self[:loc], item, _self[loc:])) + # Note: this method is overridden by all ExtensionIndex subclasses, + # so self is never backed by an EA. 
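The non-EA path of ``Index.insert`` reworked above leaves the public behaviour unchanged; a small doctest for orientation (example values are illustrative):

>>> pd.Index([1, 2, 4]).insert(2, 3)
Int64Index([1, 2, 3, 4], dtype='int64')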
+ arr = np.asarray(self) + item = self._coerce_scalar_to_index(item)._values + idx = np.concatenate((arr[:loc], item, arr[loc:])) return self._shallow_copy_with_infer(idx) def drop(self, labels, errors: str_t = "raise"): @@ -5267,7 +5324,7 @@ def _add_numeric_methods_unary(cls): Add in numeric unary methods. """ - def _make_evaluate_unary(op, opstr): + def _make_evaluate_unary(op, opstr: str_t): def _evaluate_numeric_unary(self): attrs = self._get_attributes_dict() @@ -5373,7 +5430,7 @@ def _add_logical_methods(cls): """ ) - def _make_logical_function(name, desc, f): + def _make_logical_function(name: str_t, desc: str_t, f): @Substitution(outname=name, desc=desc) @Appender(_index_shared_docs["index_" + name]) @Appender(_doc) @@ -5440,11 +5497,10 @@ def ensure_index_from_sequences(sequences, names=None): Examples -------- - >>> ensure_index_from_sequences([[1, 2, 3]], names=['name']) + >>> ensure_index_from_sequences([[1, 2, 3]], names=["name"]) Int64Index([1, 2, 3], dtype='int64', name='name') - >>> ensure_index_from_sequences([['a', 'a'], ['a', 'b']], - names=['L1', 'L2']) + >>> ensure_index_from_sequences([["a", "a"], ["a", "b"]], names=["L1", "L2"]) MultiIndex([('a', 'a'), ('a', 'b')], names=['L1', 'L2']) @@ -5463,20 +5519,24 @@ def ensure_index_from_sequences(sequences, names=None): return MultiIndex.from_arrays(sequences, names=names) -def ensure_index(index_like, copy=False): +def ensure_index(index_like, copy: bool = False): """ Ensure that we have an index from some index-like object. Parameters ---------- - index : sequence + index_like : sequence An Index or other sequence - copy : bool + copy : bool, default False Returns ------- index : Index or MultiIndex + See Also + -------- + ensure_index_from_sequences + Examples -------- >>> ensure_index(['a', 'b']) @@ -5487,13 +5547,8 @@ def ensure_index(index_like, copy=False): >>> ensure_index([['a', 'a'], ['b', 'c']]) MultiIndex([('a', 'b'), - ('a', 'c')], - dtype='object') - ) - - See Also - -------- - ensure_index_from_sequences + ('a', 'c')], + ) """ if isinstance(index_like, Index): if copy: @@ -5523,14 +5578,12 @@ def ensure_index(index_like, copy=False): # clean_index_list does the equivalent of copying # so only need to do this if not list instance if copy: - from copy import copy - - index_like = copy(index_like) + index_like = copy_func(index_like) return Index(index_like) -def _ensure_has_len(seq): +def ensure_has_len(seq): """ If seq is an iterator, put its values into a list. """ @@ -5552,7 +5605,7 @@ def _trim_front(strings): return trimmed -def _validate_join_method(method): +def _validate_join_method(method: str): if method not in ["left", "right", "inner", "outer"]: raise ValueError(f"do not recognize join method {method}") @@ -5563,7 +5616,7 @@ def default_index(n): return RangeIndex(0, n, name=None) -def maybe_extract_name(name, obj, cls) -> Optional[Hashable]: +def maybe_extract_name(name, obj, cls) -> Label: """ If no name is passed, then extract it from data, validating hashability. 
""" diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 5f0d6ea2d6278..635bf32639075 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -9,7 +9,7 @@ from pandas._libs.hashtable import duplicated_int64 from pandas._libs.lib import no_default from pandas._typing import Label -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, @@ -23,12 +23,13 @@ from pandas.core import accessor from pandas.core.algorithms import take_1d -from pandas.core.arrays.categorical import Categorical, _recode_for_categories, contains +from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name from pandas.core.indexes.extension import ExtensionIndex, inherit_names import pandas.core.missing as missing +from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) @@ -90,8 +91,6 @@ class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): dtype : CategoricalDtype or "category", optional If :class:`CategoricalDtype`, cannot be used together with `categories` or `ordered`. - - .. versionadded:: 0.21.0 copy : bool, default False Make a copy of input ndarray. name : object, optional @@ -137,21 +136,25 @@ class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): Examples -------- - >>> pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) - CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category') # noqa + >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], + categories=['a', 'b', 'c'], ordered=False, dtype='category') ``CategoricalIndex`` can also be instantiated from a ``Categorical``: - >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"]) >>> pd.CategoricalIndex(c) - CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category') # noqa + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], + categories=['a', 'b', 'c'], ordered=False, dtype='category') Ordered ``CategoricalIndex`` can have a min and max value. - >>> ci = pd.CategoricalIndex(['a','b','c','a','b','c'], ordered=True, - ... categories=['c', 'b', 'a']) + >>> ci = pd.CategoricalIndex( + ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"] + ... 
) >>> ci - CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['c', 'b', 'a'], ordered=True, dtype='category') # noqa + CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], + categories=['c', 'b', 'a'], ordered=True, dtype='category') >>> ci.min() 'c' """ @@ -232,6 +235,7 @@ def _simple_new(cls, values: Categorical, name: Label = None): result._data = values result.name = name + result._cache = {} result._reset_identity() result._no_setting_name = False @@ -239,16 +243,14 @@ def _simple_new(cls, values: Categorical, name: Label = None): # -------------------------------------------------------------------- - @Appender(Index._shallow_copy.__doc__) + @doc(Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name - if values is None: - values = self.values - - cat = Categorical(values, dtype=self.dtype) + if values is not None: + values = Categorical(values, dtype=self.dtype) - return type(self)._simple_new(cat, name=name) + return super()._shallow_copy(values=values, name=name) def _is_dtype_compat(self, other) -> bool: """ @@ -354,7 +356,7 @@ def _has_complex_internals(self) -> bool: # used to avoid libreduction code paths, which raise or require conversion return True - @Appender(Index.__contains__.__doc__) + @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. if is_scalar(key) and isna(key): @@ -363,11 +365,7 @@ def __contains__(self, key: Any) -> bool: hash(key) return contains(self, key, container=self._engine) - def __array__(self, dtype=None) -> np.ndarray: - """ the array interface, return my values """ - return np.array(self._data, dtype=dtype) - - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): if is_interval_dtype(dtype): from pandas import IntervalIndex @@ -386,7 +384,7 @@ def _isnan(self): """ return if each value is nan""" return self._data.codes == -1 - @Appender(Index.fillna.__doc__) + @doc(Index.fillna) def fillna(self, value, downcast=None): self._assert_can_do_op(value) return CategoricalIndex(self._data.fillna(value), name=self.name) @@ -399,16 +397,16 @@ def _engine(self): codes = self.codes return self._engine_type(lambda: codes, len(self)) - @Appender(Index.unique.__doc__) + @doc(Index.unique) def unique(self, level=None): if level is not None: self._validate_index_level(level) - result = self.values.unique() + result = self._values.unique() # Use _simple_new instead of _shallow_copy to ensure we keep dtype # of result, not self. return type(self)._simple_new(result, name=self.name) - @Appender(Index.duplicated.__doc__) + @doc(Index.duplicated) def duplicated(self, keep="first"): codes = self.codes.astype("i8") return duplicated_int64(codes, keep) @@ -422,7 +420,7 @@ def _maybe_cast_indexer(self, key): code = self.codes.dtype.type(code) return code - @Appender(Index.where.__doc__) + @doc(Index.where) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with # 1. copy the underlying Categorical @@ -430,7 +428,7 @@ def where(self, cond, other=None): # 3. Rebuild CategoricalIndex. 
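A short usage sketch for the ``CategoricalIndex.fillna`` and ``where`` hunks above (``.tolist()`` is used so the example does not depend on the exact wrapped repr; the data is illustrative):

>>> ci = pd.CategoricalIndex(["a", "b", None], categories=["a", "b"])
>>> ci.fillna("a").tolist()
['a', 'b', 'a']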
if other is None: other = self._na_value - values = np.where(cond, self.values, other) + values = np.where(cond, self._values, other) cat = Categorical(values, dtype=self.dtype) return type(self)._simple_new(cat, name=self.name) @@ -539,13 +537,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "method='nearest' not implemented yet for CategoricalIndex" ) - if isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target): - if self.values.equals(target.values): + if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target): + if self._values.equals(target._values): # we have the same codes codes = target.codes else: - codes = _recode_for_categories( - target.codes, target.categories, self.values.categories + codes = recode_for_categories( + target.codes, target.categories, self._values.categories ) else: if isinstance(target, CategoricalIndex): @@ -567,23 +565,13 @@ def get_indexer_non_unique(self, target): target = target.codes indexer, missing = self._engine.get_indexer_non_unique(target) return ensure_platform_int(indexer), missing - target = target.values + target = target._values codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing - @Appender(Index._convert_scalar_indexer.__doc__) - def _convert_scalar_indexer(self, key, kind: str): - assert kind in ["loc", "getitem"] - if kind == "loc": - try: - return self.categories._convert_scalar_indexer(key, kind="loc") - except TypeError: - self._invalid_indexer("label", key) - return super()._convert_scalar_indexer(key, kind=kind) - - @Appender(Index._convert_list_indexer.__doc__) + @doc(Index._convert_list_indexer) def _convert_list_indexer(self, keyarr): # Return our indexer or raise if all of the values are not included in # the categories @@ -600,7 +588,7 @@ def _convert_list_indexer(self, keyarr): return self.get_indexer(keyarr) - @Appender(Index._convert_arr_indexer.__doc__) + @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): keyarr = com.asarray_tuplesafe(keyarr) @@ -609,7 +597,7 @@ def _convert_arr_indexer(self, keyarr): return self._shallow_copy(keyarr) - @Appender(Index._convert_index_indexer.__doc__) + @doc(Index._convert_index_indexer) def _convert_index_indexer(self, keyarr): return self._shallow_copy(keyarr) @@ -622,7 +610,7 @@ def take_nd(self, *args, **kwargs): ) return self.take(*args, **kwargs) - @Appender(Index._maybe_cast_slice_bound.__doc__) + @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side, kind): if kind == "loc": return label @@ -666,7 +654,7 @@ def map(self, mapper): >>> idx = pd.CategoricalIndex(['a', 'b', 'c']) >>> idx CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], - ordered=False, dtype='category') + ordered=False, dtype='category') >>> idx.map(lambda x: x.upper()) CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'], ordered=False, dtype='category') @@ -696,7 +684,7 @@ def map(self, mapper): >>> idx.map({'a': 'first', 'b': 'second'}) Index(['first', 'second', nan], dtype='object') """ - return self._shallow_copy_with_infer(self.values.map(mapper)) + return self._shallow_copy_with_infer(self._values.map(mapper)) def delete(self, loc): """ @@ -763,6 +751,12 @@ def _delegate_method(self, name: str, *args, **kwargs): return res return CategoricalIndex(res, name=self.name) + def _wrap_joined_index( + self, joined: np.ndarray, other: "CategoricalIndex" + ) -> "CategoricalIndex": + name 
= get_op_result_name(self, other) + return self._create_from_codes(joined, name=name) + CategoricalIndex._add_numeric_methods_add_sub_disabled() CategoricalIndex._add_numeric_methods_disabled() diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 72a2aba2d8a88..25333b3a08dce 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -8,22 +8,21 @@ from pandas._libs import NaT, iNaT, join as libjoin, lib from pandas._libs.tslibs import timezones -from pandas._typing import Label +from pandas._typing import DtypeObj, Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_int64, + ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_dtype_equal, - is_float, is_integer, is_list_like, is_period_dtype, is_scalar, - needs_i8_conversion, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries @@ -32,9 +31,9 @@ from pandas.core import algorithms from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin -from pandas.core.base import _shared_docs +from pandas.core.base import IndexOpsMixin import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index from pandas.core.indexes.extension import ( ExtensionIndex, inherit_names, @@ -44,7 +43,7 @@ from pandas.core.ops import get_op_result_name from pandas.core.tools.timedeltas import to_timedelta -from pandas.tseries.frequencies import DateOffset, to_offset +from pandas.tseries.frequencies import DateOffset _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -103,6 +102,12 @@ class DatetimeIndexOpsMixin(ExtensionIndex): def is_all_dates(self) -> bool: return True + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + raise AbstractMethodError(self) + # ------------------------------------------------------------------------ # Abstract data attributes @@ -174,7 +179,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - # NB: using asi8 instead of _ndarray_values matters in numpy 1.18 + # NB: using asi8 instead of _data matters in numpy 1.18 # because the treatment of NaT has been changed to put NaT last # instead of first. sorted_values = np.sort(self.asi8) @@ -207,7 +212,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): self, indices, axis, allow_fill, fill_value, **kwargs ) - @Appender(_shared_docs["searchsorted"]) + @doc(IndexOpsMixin.searchsorted, klass="Datetime-like Index") def searchsorted(self, value, side="left", sorter=None): if isinstance(value, str): raise TypeError( @@ -377,32 +382,6 @@ def _format_attrs(self): # -------------------------------------------------------------------- # Indexing Methods - def _convert_scalar_indexer(self, key, kind: str): - """ - We don't allow integer or float indexing on datetime-like when using - loc. 
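The NaT-ordering comment in ``sort_values`` above is visible from the public API; a minimal sketch (the output assumes the usual DatetimeIndex repr of this era):

>>> pd.DatetimeIndex(["2020-01-02", "NaT"]).sort_values()
DatetimeIndex(['NaT', '2020-01-02'], dtype='datetime64[ns]', freq=None)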
- - Parameters - ---------- - key : label of the slice bound - kind : {'loc', 'getitem'} - """ - assert kind in ["loc", "getitem"] - - if not is_scalar(key): - raise TypeError(key) - - # we don't allow integer/float indexing for loc - # we don't allow float indexing for getitem - is_int = is_integer(key) - is_flt = is_float(key) - if kind == "loc" and (is_int or is_flt): - self._invalid_indexer("label", key) - elif kind == "getitem" and is_flt: - self._invalid_indexer("label", key) - - return super()._convert_scalar_indexer(key, kind=kind) - def _validate_partial_date_slice(self, reso: str): raise NotImplementedError @@ -454,6 +433,21 @@ def _partial_date_slice( # try to find the dates return (lhs_mask & rhs_mask).nonzero()[0] + @Appender(Index.get_indexer_non_unique.__doc__) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) + + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + + tgt_values = target.asi8 + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return ensure_platform_int(indexer), missing + # -------------------------------------------------------------------- __add__ = make_wrapped_arith_op("__add__") @@ -511,7 +505,7 @@ def where(self, cond, other=None): if is_categorical_dtype(other): # e.g. we have a Categorical holding self.dtype - if needs_i8_conversion(other.categories): + if is_dtype_equal(other.categories.dtype, self.dtype): other = other._internal_get_values() if not is_dtype_equal(self.dtype, other.dtype): @@ -520,7 +514,8 @@ def where(self, cond, other=None): other = other.view("i8") result = np.where(cond, values, other).astype("i8") - return self._shallow_copy(result) + arr = type(self._data)._simple_new(result, dtype=self.dtype) + return type(self)._simple_new(arr, name=self.name) def _summary(self, name=None) -> str: """ @@ -628,41 +623,22 @@ def _set_freq(self, freq): freq : DateOffset, None, or "infer" """ # GH#29843 - if freq is None: - # Always valid - pass - elif len(self) == 0 and isinstance(freq, DateOffset): - # Always valid. In the TimedeltaIndex case, we assume this - # is a Tick offset. 
- pass - else: - # As an internal method, we can ensure this assertion always holds - assert freq == "infer" - freq = to_offset(self.inferred_freq) - - self._data._freq = freq + self._data._with_freq(freq) def _shallow_copy(self, values=None, name: Label = lib.no_default): name = self.name if name is lib.no_default else name + cache = self._cache.copy() if values is None else {} if values is None: values = self._data - if isinstance(values, type(self)): - values = values._data if isinstance(values, np.ndarray): # TODO: We would rather not get here values = type(self._data)(values, dtype=self.dtype) - attributes = self._get_attributes_dict() - - if self.freq is not None: - if isinstance(values, (DatetimeArray, TimedeltaArray)): - if values.freq is None: - del attributes["freq"] - - attributes["name"] = name - return type(self)._simple_new(values, **attributes) + result = type(self)._simple_new(values, name=name) + result._cache = cache + return result # -------------------------------------------------------------------- # Set Operation Methods @@ -804,7 +780,10 @@ def _fast_union(self, other, sort=None): loc = right.searchsorted(left_start, side="left") right_chunk = right.values[:loc] dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) + result = self._shallow_copy(dates) + result._set_freq("infer") + # TODO: can we infer that it has self.freq? + return result else: left, right = other, self @@ -816,7 +795,10 @@ def _fast_union(self, other, sort=None): loc = right.searchsorted(left_end, side="right") right_chunk = right.values[loc:] dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) + result = self._shallow_copy(dates) + result._set_freq("infer") + # TODO: can we infer that it has self.freq? 
+ return result else: return left @@ -967,11 +949,11 @@ def insert(self, loc, item): ) arr = type(self._data)._simple_new(new_i8s, dtype=self.dtype, freq=freq) return type(self)._simple_new(arr, name=self.name) - except (AttributeError, TypeError): + except (AttributeError, TypeError) as err: # fall back to object index if isinstance(item, str): return self.astype(object).insert(loc, item) raise TypeError( f"cannot insert {type(self).__name__} with incompatible label" - ) + ) from err diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e303e487b1a7d..1ec6cf8fd7b4e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -7,17 +7,21 @@ from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib as libts from pandas._libs.tslibs import fields, parsing, timezones +from pandas._typing import DtypeObj, Label from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.common import ( + DT64NS_DTYPE, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_scalar, +) from pandas.core.dtypes.missing import is_valid_nat_for_dtype -from pandas.core.arrays.datetimes import ( - DatetimeArray, - tz_to_dtype, - validate_tz_from_dtype, -) +from pandas.core.arrays.datetimes import DatetimeArray, tz_to_dtype import pandas.core.common as com from pandas.core.indexes.base import Index, InvalidIndexError, maybe_extract_name from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin @@ -36,7 +40,20 @@ def _new_DatetimeIndex(cls, d): if "data" in d and not isinstance(d["data"], DatetimeIndex): # Avoid need to verify integrity by calling simple_new directly data = d.pop("data") - result = cls._simple_new(data, **d) + if not isinstance(data, DatetimeArray): + # For backward compat with older pickles, we may need to construct + # a DatetimeArray to adapt to the newer _simple_new signature + tz = d.pop("tz") + freq = d.pop("freq") + dta = DatetimeArray._simple_new(data, dtype=tz_to_dtype(tz), freq=freq) + else: + dta = data + for key in ["tz", "freq"]: + # These are already stored in our DatetimeArray; if they are + # also in the pickle and don't match, we have a problem. + if key in d: + assert d.pop(key) == getattr(dta, key) + result = cls._simple_new(dta, **d) else: with warnings.catch_warnings(): # TODO: If we knew what was going in to **d, we might be able to @@ -72,27 +89,33 @@ def _new_DatetimeIndex(cls, d): "date", "time", "timetz", + "isocalendar", ] + DatetimeArray._bool_ops, DatetimeArray, ) class DatetimeIndex(DatetimeTimedeltaMixin): """ - Immutable ndarray of datetime64 data, represented internally as int64, and - which can be boxed to Timestamp objects that are subclasses of datetime and - carry metadata such as frequency information. + Immutable ndarray-like of datetime64 data. + + Represented internally as int64, and which can be boxed to Timestamp objects + that are subclasses of datetime and carry metadata. Parameters ---------- data : array-like (1-dimensional), optional Optional datetime-like data to construct index with. - copy : bool - Make a copy of input ndarray. freq : str or pandas offset object, optional One of pandas date offset strings or corresponding objects. The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation. 
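A small sketch of the ``freq='infer'`` option described just above (dates are illustrative):

>>> dti = pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"], freq="infer")
>>> dti.freq
<Day>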
- tz : pytz.timezone or dateutil.tz.tzfile + tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str + Set the Timezone of the data. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. + closed : {'left', 'right'}, optional + Set whether to include `start` and `end` that are on the + boundary. The default includes boundary points on either end. ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. For example in Central European Time (UTC+01), when going from 03:00 @@ -107,12 +130,16 @@ class DatetimeIndex(DatetimeTimedeltaMixin): times) - 'NaT' will return NaT where there are ambiguous times - 'raise' will raise an AmbiguousTimeError if there are ambiguous times. - name : object - Name to be stored in the index. dayfirst : bool, default False If True, parse dates in `data` with the day first order. yearfirst : bool, default False If True parse dates in `data` with the year first order. + dtype : numpy.dtype or DatetimeTZDtype or str, default None + Note that the only NumPy dtype allowed is ‘datetime64[ns]’. + copy : bool, default False + Make a copy of input ndarray. + name : label, default None + Name to be stored in the index. Attributes ---------- @@ -235,41 +262,21 @@ def __new__( return subarr @classmethod - def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): - """ - We require the we have a dtype compat for the values - if we are passed a non-dtype compat, then coerce using the constructor - """ - if isinstance(values, DatetimeArray): - if tz: - tz = validate_tz_from_dtype(dtype, tz) - dtype = DatetimeTZDtype(tz=tz) - elif dtype is None: - dtype = _NS_DTYPE - - values = DatetimeArray(values, freq=freq, dtype=dtype) - tz = values.tz - freq = values.freq - values = values._data - - dtype = tz_to_dtype(tz) - dtarr = DatetimeArray._simple_new(values, freq=freq, dtype=dtype) - assert isinstance(dtarr, DatetimeArray) + def _simple_new(cls, values: DatetimeArray, name: Label = None): + assert isinstance(values, DatetimeArray), type(values) result = object.__new__(cls) - result._data = dtarr + result._data = values result.name = name + result._cache = {} result._no_setting_name = False # For groupby perf. See note in indexes/base about _index_data - result._index_data = dtarr._data + result._index_data = values._data result._reset_identity() return result # -------------------------------------------------------------------- - def __array__(self, dtype=None) -> np.ndarray: - return np.asarray(self._data, dtype=dtype) - @cache_readonly def _is_dates_only(self) -> bool: """ @@ -281,7 +288,7 @@ def _is_dates_only(self) -> bool: """ from pandas.io.formats.format import _is_dates_only - return _is_dates_only(self.values) and self.tz is None + return self.tz is None and _is_dates_only(self._values) def __reduce__(self): @@ -300,6 +307,18 @@ def _convert_for_op(self, value): return Timestamp(value).asm8 raise ValueError("Passed item and index have different timezone") + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? 
+ """ + if not is_datetime64_any_dtype(dtype): + return False + if self.tz is not None: + # If we have tz, we can compare to tzaware + return is_datetime64tz_dtype(dtype) + # if we dont have tz, we can only compare to tznaive + return is_datetime64_dtype(dtype) + # -------------------------------------------------------------------- # Rendering Methods @@ -436,7 +455,7 @@ def snap(self, freq="S"): # Superdumb, punting on any optimizing freq = to_offset(freq) - snapped = np.empty(len(self), dtype=_NS_DTYPE) + snapped = np.empty(len(self), dtype=DT64NS_DTYPE) for i, v in enumerate(self): s = v @@ -552,8 +571,8 @@ def get_loc(self, key, method=None, tolerance=None): try: key = self._maybe_cast_for_get_loc(key) - except ValueError: - raise KeyError(key) + except ValueError as err: + raise KeyError(key) from err elif isinstance(key, timedelta): # GH#20464 @@ -574,8 +593,8 @@ def get_loc(self, key, method=None, tolerance=None): try: return Index.get_loc(self, key, method, tolerance) - except KeyError: - raise KeyError(orig_key) + except KeyError as err: + raise KeyError(orig_key) from err def _maybe_cast_for_get_loc(self, key) -> Timestamp: # needed to localize naive datetimes @@ -702,7 +721,7 @@ def inferred_type(self) -> str: def indexer_at_time(self, time, asof=False): """ - Return index locations of index values at particular time of day + Return index locations of values at particular time of day (e.g. 9:30AM). Parameters @@ -718,7 +737,9 @@ def indexer_at_time(self, time, asof=False): See Also -------- - indexer_between_time, DataFrame.at_time + indexer_between_time : Get index locations of values between particular + times of day. + DataFrame.at_time : Select values at particular time of day. """ if asof: raise NotImplementedError("'asof' argument is not supported") @@ -759,7 +780,8 @@ def indexer_between_time( See Also -------- - indexer_at_time, DataFrame.between_time + indexer_at_time : Get index locations of values at particular time of day. + DataFrame.between_time : Select values between particular times of day. """ start_time = tools.to_time(start_time) end_time = tools.to_time(end_time) @@ -993,16 +1015,10 @@ def bdate_range( Weekmask of valid business days, passed to ``numpy.busdaycalendar``, only used when custom frequency strings are passed. The default value None is equivalent to 'Mon Tue Wed Thu Fri'. - - .. versionadded:: 0.21.0 - holidays : list-like or None, default None Dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar``, only used when custom frequency strings are passed. - - .. versionadded:: 0.21.0 - closed : str, default None Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None). 
@@ -1040,9 +1056,9 @@ def bdate_range( try: weekmask = weekmask or "Mon Tue Wed Thu Fri" freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask) - except (KeyError, TypeError): + except (KeyError, TypeError) as err: msg = f"invalid custom frequency string: {freq}" - raise ValueError(msg) + raise ValueError(msg) from err elif holidays or weekmask: msg = ( "a custom frequency string is required when holidays or " diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index daccb35864e98..c752990531b34 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -6,7 +6,8 @@ import numpy as np from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, cache_readonly +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, @@ -213,7 +214,10 @@ class ExtensionIndex(Index): def __getitem__(self, key): result = self._data[key] if isinstance(result, type(self._data)): - return type(self)(result, name=self.name) + if result.ndim == 1: + return type(self)(result, name=self.name) + # Unpack to ndarray for MPL compat + result = result._data # Includes cases where we get a 2D ndarray back for MPL compat deprecate_ndim_indexing(result) @@ -224,11 +228,13 @@ def __iter__(self): # --------------------------------------------------------------------- - @property - def _ndarray_values(self) -> np.ndarray: - return self._data._ndarray_values + def __array__(self, dtype=None) -> np.ndarray: + return np.asarray(self._data, dtype=dtype) - @Appender(Index.dropna.__doc__) + def _get_engine_target(self) -> np.ndarray: + return self._data._values_for_argsort() + + @doc(Index.dropna) def dropna(self, how="any"): if how not in ("any", "all"): raise ValueError(f"invalid how option: {how}") @@ -242,11 +248,15 @@ def repeat(self, repeats, axis=None): result = self._data.repeat(repeats, axis=axis) return self._shallow_copy(result) + def insert(self, loc: int, item): + # ExtensionIndex subclasses must override Index.insert + raise AbstractMethodError(self) + def _concat_same_dtype(self, to_concat, name): arr = type(self._data)._concat_same_type(to_concat) return type(self)._simple_new(arr, name=name) - @Appender(Index.take.__doc__) + @doc(Index.take) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) @@ -276,7 +286,7 @@ def _get_unique_index(self, dropna=False): result = result[~result.isna()] return self._shallow_copy(result) - @Appender(Index.map.__doc__) + @doc(Index.map) def map(self, mapper, na_action=None): # Try to run function on index first, and then on elements of index # Especially important for group-by functionality @@ -293,7 +303,7 @@ def map(self, mapper, na_action=None): except Exception: return self.astype(object).map(mapper) - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype) and copy is False: # Ensure that self.astype(self.dtype) is self diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a7bb4237eab69..18e995ce4efd7 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -20,7 +20,7 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, - is_categorical, + is_categorical_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, @@ 
-39,6 +39,7 @@ from pandas.core.algorithms import take_1d from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com +from pandas.core.indexers import is_valid_positional_slice import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, @@ -242,6 +243,7 @@ def _simple_new(cls, array: IntervalArray, name: Label = None): result = IntervalMixin.__new__(cls) result._data = array result.name = name + result._cache = {} result._no_setting_name = False result._reset_identity() return result @@ -331,12 +333,15 @@ def from_tuples( # -------------------------------------------------------------------- @Appender(Index._shallow_copy.__doc__) - def _shallow_copy(self, values=None, **kwargs): + def _shallow_copy(self, values=None, name: Label = lib.no_default): + name = self.name if name is lib.no_default else name + cache = self._cache.copy() if values is None else {} if values is None: values = self._data - attributes = self._get_attributes_dict() - attributes.update(kwargs) - return self._simple_new(values, **attributes) + + result = self._simple_new(values, name=name) + result._cache = cache + return result @cache_readonly def _isnan(self): @@ -404,7 +409,7 @@ def __reduce__(self): @Appender(Index.astype.__doc__) def astype(self, dtype, copy=True): with rewrite_exception("IntervalArray", type(self).__name__): - new_values = self.values.astype(dtype, copy=copy) + new_values = self._values.astype(dtype, copy=copy) if is_interval_dtype(new_values): return self._shallow_copy(new_values) return Index.astype(self, dtype, copy=copy) @@ -421,7 +426,7 @@ def memory_usage(self, deep: bool = False) -> int: return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) # IntervalTree doesn't have a is_monotonic_decreasing, so have to override - # the Index implemenation + # the Index implementation @cache_readonly def is_monotonic_decreasing(self) -> bool: """ @@ -514,12 +519,6 @@ def _should_fallback_to_positional(self): # positional in this case return self.dtype.subtype.kind in ["m", "M"] - @Appender(Index._convert_scalar_indexer.__doc__) - def _convert_scalar_indexer(self, key, kind: str): - assert kind in ["getitem", "loc"] - # never iloc, so no-op - return key - def _maybe_cast_slice_bound(self, label, side, kind): return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) @@ -724,9 +723,9 @@ def get_loc( op_right = le if self.closed_right else lt try: mask = op_left(self.left, key) & op_right(key, self.right) - except TypeError: + except TypeError as err: # scalar is not comparable to II subtype --> invalid label - raise KeyError(key) + raise KeyError(key) from err matches = mask.sum() if matches == 0: @@ -788,7 +787,7 @@ def get_indexer( left_indexer = self.left.get_indexer(target_as_index.left) right_indexer = self.right.get_indexer(target_as_index.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) - elif is_categorical(target_as_index): + elif is_categorical_dtype(target_as_index.dtype): # get an indexer for unique categories then propagate to codes via take_1d categories_indexer = self.get_indexer(target_as_index.categories) indexer = take_1d(categories_indexer, target_as_index.codes, fill_value=-1) @@ -805,9 +804,9 @@ def get_indexer( loc = self.get_loc(key) except KeyError: loc = -1 - except InvalidIndexError: + except InvalidIndexError as err: # i.e. 
non-scalar key - raise TypeError(key) + raise TypeError(key) from err indexer.append(loc) return ensure_platform_int(indexer) @@ -872,14 +871,23 @@ def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: def _convert_slice_indexer(self, key: slice, kind: str): if not (key.step is None or key.step == 1): - raise ValueError("cannot support not-default step in a slice") + # GH#31658 if label-based, we require step == 1, + # if positional, we disallow float start/stop + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + if kind == "loc": + raise ValueError(msg) + elif kind == "getitem": + if not is_valid_positional_slice(key): + # i.e. this cannot be interpreted as a positional slice + raise ValueError(msg) + return super()._convert_slice_indexer(key, kind) @Appender(Index.where.__doc__) def where(self, cond, other=None): if other is None: other = self._na_value - values = np.where(cond, self.values, other) + values = np.where(cond, self._values, other) result = IntervalArray(values) return self._shallow_copy(result) @@ -1096,9 +1104,9 @@ def func(self, other, sort=sort): # GH 19101: ensure empty results have correct dtype if result.empty: - result = result.values.astype(self.dtype.subtype) + result = result._values.astype(self.dtype.subtype) else: - result = result.values + result = result._values return type(self).from_tuples(result, closed=self.closed, name=result_name) @@ -1279,10 +1287,10 @@ def interval_range( if freq is not None and not is_number(freq): try: freq = to_offset(freq) - except ValueError: + except ValueError as err: raise ValueError( f"freq must be numeric or convertible to DateOffset, got {freq}" - ) + ) from err # verify type compatibility if not all( diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4bd462e83a5bc..42e0d228dab09 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -18,10 +18,10 @@ from pandas._libs import algos as libalgos, index as libindex, lib from pandas._libs.hashtable import duplicated_int64 -from pandas._typing import AnyArrayLike, ArrayLike, Scalar +from pandas._typing import AnyArrayLike, Scalar from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning, UnsortedIndexError -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.cast import coerce_indexer_dtype from pandas.core.dtypes.common import ( @@ -52,6 +52,7 @@ ensure_index, ) from pandas.core.indexes.frozen import FrozenList +from pandas.core.indexes.numeric import Int64Index import pandas.core.missing as missing from pandas.core.sorting import ( get_group_index, @@ -275,6 +276,7 @@ def __new__( raise ValueError("Must pass non-zero number of levels/codes") result = object.__new__(MultiIndex) + result._cache = {} # we've already validated levels and codes, so shortcut here result._set_levels(levels, copy=copy, validate=False) @@ -563,6 +565,7 @@ def from_product(cls, iterables, sortorder=None, names=lib.no_default): if names is lib.no_default: names = [getattr(it, "name", None) for it in iterables] + # codes are all ndarrays, so cartesian_product is lossless codes = cartesian_product(codes) return MultiIndex(levels, codes, sortorder=sortorder, names=names) @@ -759,10 +762,26 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): Examples -------- - >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), - (2, 'one'), (2, 'two'), - 
(3, 'one'), (3, 'two')], - names=['foo', 'bar']) + >>> idx = pd.MultiIndex.from_tuples( + ... [ + ... (1, "one"), + ... (1, "two"), + ... (2, "one"), + ... (2, "two"), + ... (3, "one"), + ... (3, "two") + ... ], + ... names=["foo", "bar"] + ... ) + >>> idx + MultiIndex([(1, 'one'), + (1, 'two'), + (2, 'one'), + (2, 'two'), + (3, 'one'), + (3, 'two')], + names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) MultiIndex([('a', 1), ('a', 2), @@ -795,10 +814,12 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) MultiIndex([('a', 1), - ('a', 2), - ('b', 1), - ('b', 2)], - names=['foo', 'bar']) + ('a', 2), + ('b', 1), + ('b', 2), + ('c', 1), + ('c', 2)], + names=['foo', 'bar']) >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ @@ -904,11 +925,16 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): Examples -------- - >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), - (1, 'two'), - (2, 'one'), - (2, 'two')], - names=['foo', 'bar']) + >>> idx = pd.MultiIndex.from_tuples( + ... [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"] + ... ) + >>> idx + MultiIndex([(1, 'one'), + (1, 'two'), + (2, 'one'), + (2, 'two')], + names=['foo', 'bar']) + >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]]) MultiIndex([(2, 'one'), (1, 'one'), @@ -983,14 +1009,42 @@ def _engine(self): def _constructor(self): return MultiIndex.from_tuples - @Appender(Index._shallow_copy.__doc__) - def _shallow_copy(self, values=None, **kwargs): + @doc(Index._shallow_copy) + def _shallow_copy( + self, + values=None, + name=lib.no_default, + levels=None, + codes=None, + dtype=None, + sortorder=None, + names=lib.no_default, + _set_identity: bool = True, + ): + if names is not lib.no_default and name is not lib.no_default: + raise TypeError("Can only provide one of `names` and `name`") + elif names is lib.no_default: + names = name if name is not lib.no_default else self.names + if values is not None: - names = kwargs.pop("names", kwargs.pop("name", self.names)) - # discards freq - kwargs.pop("freq", None) - return MultiIndex.from_tuples(values, names=names, **kwargs) - return self.copy(**kwargs) + assert levels is None and codes is None and dtype is None + return MultiIndex.from_tuples(values, sortorder=sortorder, names=names) + + levels = levels if levels is not None else self.levels + codes = codes if codes is not None else self.codes + + result = MultiIndex( + levels=levels, + codes=codes, + dtype=dtype, + sortorder=sortorder, + names=names, + verify_integrity=False, + _set_identity=_set_identity, + ) + result._cache = self._cache.copy() + result._cache.pop("levels", None) # GH32669 + return result def _shallow_copy_with_infer(self, values, **kwargs): # On equal MultiIndexes the difference is empty. 
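For orientation on the ``_shallow_copy`` rework above and the ``copy`` hunk that follows: the public ``MultiIndex.copy`` still accepts ``names`` (the names used here are made up):

>>> mi = pd.MultiIndex.from_tuples([(1, "one"), (2, "two")], names=["foo", "bar"])
>>> mi.copy(names=["x", "y"]).names
FrozenList(['x', 'y'])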
@@ -1047,17 +1101,13 @@ def copy( levels = deepcopy(self.levels) if codes is None: codes = deepcopy(self.codes) - else: - if levels is None: - levels = self.levels - if codes is None: - codes = self.codes - return MultiIndex( + + return self._shallow_copy( levels=levels, codes=codes, names=names, + dtype=dtype, sortorder=self.sortorder, - verify_integrity=False, _set_identity=_set_identity, ) @@ -1071,7 +1121,7 @@ def view(self, cls=None): result._id = self._id return result - @Appender(Index.__contains__.__doc__) + @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: hash(key) try: @@ -1092,7 +1142,7 @@ def f(l): return any(f(l) for l in self._inferred_type_levels) - @Appender(Index.memory_usage.__doc__) + @doc(Index.memory_usage) def memory_usage(self, deep: bool = False) -> int: # we are overwriting our base class to avoid # computing .values here which could materialize @@ -1180,7 +1230,7 @@ def _format_native_types(self, na_rep="nan", **kwargs): sortorder=self.sortorder, verify_integrity=False, ) - return mi.values + return mi._values def format( self, @@ -1324,7 +1374,7 @@ def _set_names(self, names, level=None, validate=True): # -------------------------------------------------------------------- - @Appender(Index._get_grouper_for_level.__doc__) + @doc(Index._get_grouper_for_level) def _get_grouper_for_level(self, mapper, level): indexer = self.codes[level] level_index = self.levels[level] @@ -1373,9 +1423,9 @@ def _get_level_number(self, level) -> int: ) try: level = self.names.index(level) - except ValueError: + except ValueError as err: if not is_integer(level): - raise KeyError(f"Level {level} not found") + raise KeyError(f"Level {level} not found") from err elif level < 0: level += self.nlevels if level < 0: @@ -1383,13 +1433,13 @@ def _get_level_number(self, level) -> int: raise IndexError( f"Too many levels: Index has only {self.nlevels} levels, " f"{orig_level} is not a valid level number" - ) + ) from err # Note: levels are zero-based elif level >= self.nlevels: raise IndexError( f"Too many levels: Index has only {self.nlevels} levels, " f"not {level + 1}" - ) + ) from err return level @property @@ -1419,7 +1469,7 @@ def is_monotonic_increasing(self) -> bool: except TypeError: # we have mixed types and np.lexsort is not happy - return Index(self.values).is_monotonic + return Index(self._values).is_monotonic @cache_readonly def is_monotonic_decreasing(self) -> bool: @@ -1435,7 +1485,7 @@ def _inferred_type_levels(self): """ return a list of the inferred types, one for each level """ return [i.inferred_type for i in self.levels] - @Appender(Index.duplicated.__doc__) + @doc(Index.duplicated) def duplicated(self, keep="first"): shape = map(len, self.levels) ids = get_group_index(self.codes, shape, sort=False, xnull=False) @@ -1448,7 +1498,7 @@ def fillna(self, value=None, downcast=None): """ raise NotImplementedError("isna is not defined for MultiIndex") - @Appender(Index.dropna.__doc__) + @doc(Index.dropna) def dropna(self, how="any"): nans = [level_codes == -1 for level_codes in self.codes] if how == "any": @@ -1521,7 +1571,7 @@ def get_level_values(self, level): values = self._get_level_values(level) return values - @Appender(Index.unique.__doc__) + @doc(Index.unique) def unique(self, level=None): if level is None: @@ -1557,7 +1607,8 @@ def to_frame(self, index=True, name=None): See Also -------- - DataFrame + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous + tabular data. 
""" from pandas import DataFrame @@ -1612,7 +1663,7 @@ def to_flat_index(self): ('bar', 'baz'), ('bar', 'qux')], dtype='object') """ - return Index(self.values, tupleize_cols=False) + return Index(self._values, tupleize_cols=False) @property def is_all_dates(self) -> bool: @@ -1625,6 +1676,30 @@ def is_lexsorted(self) -> bool: Returns ------- bool + + Examples + -------- + In the below examples, the first level of the MultiIndex is sorted because + a>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['d', 'e', 'f']]).is_lexsorted() + True + >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['d', 'f', 'e']]).is_lexsorted() + True + + In case there is a tie, the lexicographical sorting looks + at the next level of the MultiIndex. + + >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'b', 'c']]).is_lexsorted() + True + >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'c', 'b']]).is_lexsorted() + False + >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], + ... ['aa', 'bb', 'aa', 'bb']]).is_lexsorted() + True + >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], + ... ['bb', 'aa', 'aa', 'bb']]).is_lexsorted() + False """ return self.lexsort_depth == self.nlevels @@ -1914,7 +1989,7 @@ def append(self, other): arrays.append(label.append(appended)) return MultiIndex.from_arrays(arrays, names=self.names) - to_concat = (self.values,) + tuple(k._values for k in other) + to_concat = (self._values,) + tuple(k._values for k in other) new_tuples = np.concatenate(to_concat) # if all(isinstance(x, MultiIndex) for x in other): @@ -1924,7 +1999,7 @@ def append(self, other): return Index(new_tuples) def argsort(self, *args, **kwargs) -> np.ndarray: - return self.values.argsort(*args, **kwargs) + return self._values.argsort(*args, **kwargs) @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): @@ -2213,7 +2288,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): # GH7774: preserve dtype/tz if target is empty and not an Index. # target may be an iterator - target = ibase._ensure_has_len(target) + target = ibase.ensure_has_len(target) if len(target) == 0 and not isinstance(target, Index): idx = self.levels[level] attrs = idx._get_attributes_dict() @@ -2258,23 +2333,21 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): # -------------------------------------------------------------------- # Indexing Methods - def get_value(self, series, key): - # Label-based + def _check_indexing_error(self, key): if not is_hashable(key) or is_iterator(key): # We allow tuples if they are hashable, whereas other Index # subclasses require scalar. # We have to explicitly exclude generators, as these are hashable. raise InvalidIndexError(key) - try: - loc = self.get_loc(key) - except KeyError: - if is_integer(key): - loc = key - else: - raise - - return self._get_values_for_loc(series, loc, key) + def _should_fallback_to_positional(self) -> bool: + """ + If an integer key is not found, should we fall back to positional indexing? 
+ """ + if not self.nlevels: + return False + # GH#33355 + return self.levels[0]._should_fallback_to_positional() def _get_values_for_loc(self, series: "Series", loc, key): """ @@ -2368,7 +2441,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): # let's instead try with a straight Index if method is None: - return Index(self.values).get_indexer( + return Index(self._values).get_indexer( target, method=method, limit=limit, tolerance=tolerance ) @@ -2380,7 +2453,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise NotImplementedError( "tolerance not implemented yet for MultiIndex" ) - indexer = self._engine.get_indexer(target, method, limit) + indexer = self._engine.get_indexer( + values=self.values, target=target, method=method, limit=limit + ) elif method == "nearest": raise NotImplementedError( "method='nearest' not implemented yet " @@ -2699,8 +2774,7 @@ def get_loc_level(self, key, level=0, drop_level: bool = True): (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B')) >>> mi.get_loc_level('e', level='B') - (array([False, True, False], dtype=bool), - Index(['b'], dtype='object', name='A')) + (array([False, True, False]), Index(['b'], dtype='object', name='A')) >>> mi.get_loc_level(['b', 'e']) (1, None) @@ -2831,7 +2905,8 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): mapper = Series(indexer) indexer = codes.take(ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) - m = result.map(mapper)._ndarray_values + m = result.map(mapper) + m = np.asarray(m) else: m = np.zeros(len(codes), dtype=bool) @@ -2949,7 +3024,7 @@ def get_locs(self, seq): n = len(self) indexer = None - def _convert_to_indexer(r): + def _convert_to_indexer(r) -> Int64Index: # return an indexer if isinstance(r, slice): m = np.zeros(n, dtype=bool) @@ -2998,7 +3073,7 @@ def _update_indexer(idxr, indexer=indexer): indexer = _update_indexer(indexers, indexer=indexer) else: # no matches we are done - return Int64Index([])._ndarray_values + return np.array([], dtype=np.int64) elif com.is_null_slice(k): # empty slice @@ -3024,15 +3099,18 @@ def _update_indexer(idxr, indexer=indexer): # empty indexer if indexer is None: - return Int64Index([])._ndarray_values + return np.array([], dtype=np.int64) + assert isinstance(indexer, Int64Index), type(indexer) indexer = self._reorder_indexer(seq, indexer) - return indexer._ndarray_values + return indexer._values def _reorder_indexer( - self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], indexer: ArrayLike - ) -> ArrayLike: + self, + seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], + indexer: Int64Index, + ) -> Int64Index: """ Reorder an indexer of a MultiIndex (self) so that the label are in the same order as given in seq @@ -3136,11 +3214,10 @@ def equals(self, other) -> bool: if not isinstance(other, MultiIndex): # d-level MultiIndex can equal d-tuple Index if not is_object_dtype(other.dtype): - if self.nlevels != other.nlevels: - return False + # other cannot contain tuples, so cannot match self + return False - other_vals = com.values_from_object(ensure_index(other)) - return array_equivalent(self._ndarray_values, other_vals) + return array_equivalent(self._values, other._values) if self.nlevels != other.nlevels: return False @@ -3220,7 +3297,46 @@ def union(self, other, sort=None): ------- Index - >>> index.union(index2) + Examples + -------- + >>> idx1 = pd.MultiIndex.from_arrays( + ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ... 
) + >>> idx1 + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue')], + ) + >>> idx2 = pd.MultiIndex.from_arrays( + ... [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ... ) + >>> idx2 + MultiIndex([(3, 'Red'), + (3, 'Green'), + (2, 'Red'), + (2, 'Green')], + ) + + >>> idx1.union(idx2) + MultiIndex([(1, 'Blue'), + (1, 'Red'), + (2, 'Blue'), + (2, 'Green'), + (2, 'Red'), + (3, 'Green'), + (3, 'Red')], + ) + + >>> idx1.union(idx2, sort=False) + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue'), + (3, 'Red'), + (3, 'Green'), + (2, 'Green')], + ) """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) @@ -3231,9 +3347,13 @@ def union(self, other, sort=None): # TODO: Index.union returns other when `len(self)` is 0. - uniq_tuples = lib.fast_unique_multiple( - [self._ndarray_values, other._ndarray_values], sort=sort - ) + if not is_object_dtype(other.dtype): + raise NotImplementedError( + "Can only union MultiIndex with MultiIndex or Index of tuples, " + "try mi.to_flat_index().union(other) instead." + ) + + uniq_tuples = lib.fast_unique_multiple([self._values, other._values], sort=sort) return MultiIndex.from_arrays( zip(*uniq_tuples), sortorder=0, names=result_names @@ -3267,10 +3387,20 @@ def intersection(self, other, sort=False): if self.equals(other): return self - lvals = self._ndarray_values - rvals = other._ndarray_values + if not is_object_dtype(other.dtype): + # The intersection is empty + # TODO: we have no tests that get here + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) + + lvals = self._values + rvals = other._values - uniq_tuples = None # flag whether _inner_indexer was succesful + uniq_tuples = None # flag whether _inner_indexer was successful if self.is_monotonic and other.is_monotonic: try: uniq_tuples = self._inner_indexer(lvals, rvals)[0] @@ -3342,7 +3472,7 @@ def difference(self, other, sort=None): indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - difference = this.values.take(label_diff) + difference = this._values.take(label_diff) if sort is None: difference = sorted(difference) @@ -3359,7 +3489,8 @@ def difference(self, other, sort=None): def _convert_can_do_setop(self, other): result_names = self.names - if not hasattr(other, "names"): + if not isinstance(other, Index): + if len(other) == 0: other = MultiIndex( levels=[[]] * self.nlevels, @@ -3370,15 +3501,15 @@ def _convert_can_do_setop(self, other): msg = "other must be a MultiIndex or a list of tuples" try: other = MultiIndex.from_tuples(other) - except TypeError: - raise TypeError(msg) + except TypeError as err: + raise TypeError(msg) from err else: result_names = self.names if self.names == other.names else None return other, result_names # -------------------------------------------------------------------- - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_categorical_dtype(dtype): @@ -3453,11 +3584,11 @@ def _wrap_joined_index(self, joined, other): names = self.names if self.names == other.names else None return MultiIndex.from_tuples(joined, names=names) - @Appender(Index.isin.__doc__) + @doc(Index.isin) def isin(self, values, level=None): if level is None: - values = MultiIndex.from_tuples(values, names=self.names).values - return algos.isin(self.values, values) + values = MultiIndex.from_tuples(values, names=self.names)._values + 
return algos.isin(self._values, values) else: num = self._get_level_number(level) levs = self.get_level_values(num) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 06a26cc90555e..e2be58a56018d 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -4,7 +4,7 @@ from pandas._libs import index as libindex, lib from pandas._typing import Dtype, Label -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -95,24 +95,20 @@ def _validate_dtype(cls, dtype: Dtype) -> None: f"Incorrect `dtype` passed: expected {expected}, received {dtype}" ) - @Appender(Index._maybe_cast_slice_bound.__doc__) + @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side, kind): assert kind in ["loc", "getitem", None] # we will try to coerce to integers return self._maybe_cast_indexer(label) - @Appender(Index._shallow_copy.__doc__) + @doc(Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = lib.no_default): - name = name if name is not lib.no_default else self.name - if values is not None and not self._can_hold_na and values.dtype.kind == "f": + name = self.name if name is lib.no_default else name # Ensure we are not returning an Int64Index with float data: return Float64Index._simple_new(values, name=name) - - if values is None: - values = self.values - return type(self)._simple_new(values, name=name) + return super()._shallow_copy(values=values, name=name) def _convert_for_op(self, value): """ @@ -162,7 +158,7 @@ def is_all_dates(self) -> bool: """ return False - @Appender(Index.insert.__doc__) + @doc(Index.insert) def insert(self, loc: int, item): # treat NA values as nans: if is_scalar(item) and isna(item): @@ -252,15 +248,7 @@ def inferred_type(self) -> str: @property def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak - return self.values.view(self._default_dtype) - - @Appender(Index._convert_scalar_indexer.__doc__) - def _convert_scalar_indexer(self, key, kind: str): - assert kind in ["loc", "getitem"] - - # never iloc, which we don't coerce to integers - key = self._maybe_cast_indexer(key) - return super()._convert_scalar_indexer(key, kind=kind) + return self._values.view(self._default_dtype) class Int64Index(IntegerIndex): @@ -307,7 +295,7 @@ class UInt64Index(IntegerIndex): _engine_type = libindex.UInt64Engine _default_dtype = np.dtype(np.uint64) - @Appender(Index._convert_arr_indexer.__doc__) + @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so that the values returned # from indexing are also uint64. 
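
Note on the simplified numeric `_shallow_copy` above: it keeps the guard that an integer index handed float data must come back as a `Float64Index` rather than an `Int64Index` wrapping floats. A rough illustration of that internal contract, assuming the pandas version targeted by this diff (it calls private API purely for demonstration and is not something user code should rely on):

```python
import numpy as np
import pandas as pd

idx = pd.Index([1, 2, 3], dtype="int64")          # Int64Index
new = idx._shallow_copy(np.array([1.5, 2.5, 3.5]))

# Int64Index cannot hold NaN (_can_hold_na is False), so float values are
# re-wrapped as a Float64Index instead of an integer index around float data.
print(type(new).__name__, new.dtype)              # Float64Index float64
```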
@@ -319,7 +307,7 @@ def _convert_arr_indexer(self, keyarr): return com.asarray_tuplesafe(keyarr, dtype=dtype) - @Appender(Index._convert_index_indexer.__doc__) + @doc(Index._convert_index_indexer) def _convert_index_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the values returned from indexing are @@ -369,7 +357,7 @@ def inferred_type(self) -> str: """ return "floating" - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if needs_i8_conversion(dtype): @@ -380,24 +368,18 @@ def astype(self, dtype, copy=True): elif is_integer_dtype(dtype) and not is_extension_array_dtype(dtype): # TODO(jreback); this can change once we have an EA Index type # GH 13149 - arr = astype_nansafe(self.values, dtype=dtype) - return Int64Index(arr) + arr = astype_nansafe(self._values, dtype=dtype) + return Int64Index(arr, name=self.name) return super().astype(dtype, copy=copy) # ---------------------------------------------------------------- # Indexing Methods - @Appender(Index._should_fallback_to_positional.__doc__) + @doc(Index._should_fallback_to_positional) def _should_fallback_to_positional(self): return False - @Appender(Index._convert_scalar_indexer.__doc__) - def _convert_scalar_indexer(self, key, kind: str): - assert kind in ["loc", "getitem"] - # no-op for non-iloc - return key - - @Appender(Index._convert_slice_indexer.__doc__) + @doc(Index._convert_slice_indexer) def _convert_slice_indexer(self, key: slice, kind: str): assert kind in ["loc", "getitem"] @@ -413,7 +395,7 @@ def _format_native_types( from pandas.io.formats.format import FloatArrayFormatter formatter = FloatArrayFormatter( - self.values, + self._values, na_rep=na_rep, float_format=float_format, decimal=decimal, @@ -439,7 +421,7 @@ def equals(self, other) -> bool: other = self._constructor(other) if not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape: return False - left, right = self._ndarray_values, other._ndarray_values + left, right = self._values, other._values return ((left == right) | (self._isnan & other._isnan)).all() except (TypeError, ValueError): return False @@ -451,7 +433,7 @@ def __contains__(self, other: Any) -> bool: return is_float(other) and np.isnan(other) and self.hasnans - @Appender(Index.get_loc.__doc__) + @doc(Index.get_loc) def get_loc(self, key, method=None, tolerance=None): if is_bool(key): # Catch this to avoid accidentally casting to 1.0 @@ -471,7 +453,7 @@ def get_loc(self, key, method=None, tolerance=None): def is_unique(self) -> bool: return super().is_unique and self._nan_idxs.size < 2 - @Appender(Index.isin.__doc__) + @doc(Index.isin) def isin(self, values, level=None): if level is not None: self._validate_index_level(level) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 35a5d99abf4e6..1f565828ec7a5 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -9,8 +9,8 @@ from pandas._libs.tslibs import frequencies as libfrequencies, resolution from pandas._libs.tslibs.parsing import parse_time_string from pandas._libs.tslibs.period import Period -from pandas._typing import Label -from pandas.util._decorators import Appender, cache_readonly +from pandas._typing import DtypeObj, Label +from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, @@ -19,11 +19,11 @@ is_dtype_equal, is_float, is_integer, - is_integer_dtype, is_object_dtype, is_scalar, pandas_dtype, ) +from 
pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.arrays.period import ( PeriodArray, @@ -138,7 +138,9 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): Examples -------- - >>> idx = pd.PeriodIndex(year=year_arr, quarter=q_arr) + >>> idx = pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3]) + >>> idx + PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]', freq='Q-DEC') """ _typ = "periodindex" @@ -149,6 +151,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): _infer_as_myclass = True _data: PeriodArray + freq: DateOffset _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True @@ -234,6 +237,7 @@ def _simple_new(cls, values: PeriodArray, name: Label = None): # For groupby perf. See note in indexes/base about _index_data result._index_data = values._data result.name = name + result._cache = {} result._reset_identity() return result @@ -250,23 +254,14 @@ def _has_complex_internals(self): return True def _shallow_copy(self, values=None, name: Label = no_default): - # TODO: simplify, figure out type of values name = name if name is not no_default else self.name - + cache = self._cache.copy() if values is None else {} if values is None: values = self._data - if isinstance(values, type(self)): - values = values._data - - if not isinstance(values, PeriodArray): - if isinstance(values, np.ndarray) and values.dtype == "i8": - values = PeriodArray(values, freq=self.freq) - else: - # GH#30713 this should never be reached - raise TypeError(type(values), getattr(values, "dtype", None)) - - return self._simple_new(values, name=name) + result = self._simple_new(values, name=name) + result._cache = cache + return result def _maybe_convert_timedelta(self, other): """ @@ -307,12 +302,20 @@ def _maybe_convert_timedelta(self, other): # raise when input doesn't have freq raise raise_on_incompatible(self, None) + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + if not isinstance(dtype, PeriodDtype): + return False + return dtype.freq == self.freq + # ------------------------------------------------------------------------ # Rendering Methods def _mpl_repr(self): # how to represent ourselves to matplotlib - return self.astype(object).values + return self.astype(object)._values @property def _formatter_func(self): @@ -323,11 +326,11 @@ def _formatter_func(self): @cache_readonly def _engine(self): - # To avoid a reference cycle, pass a weakref of self to _engine_type. - period = weakref.ref(self) + # To avoid a reference cycle, pass a weakref of self._values to _engine_type. + period = weakref.ref(self._values) return self._engine_type(period, len(self)) - @Appender(Index.__contains__.__doc__) + @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: if isinstance(key, Period): if key.freq != self.freq: @@ -349,12 +352,6 @@ def _int64index(self) -> Int64Index: # ------------------------------------------------------------------------ # Index Methods - def __array__(self, dtype=None) -> np.ndarray: - if is_integer_dtype(dtype): - return self.asi8 - else: - return self.astype(object).values - def __array_wrap__(self, result, context=None): """ Gets called after a ufunc. 
Needs additional handling as @@ -388,31 +385,30 @@ def __array_wrap__(self, result, context=None): # cannot pass _simple_new as it is return type(self)(result, freq=self.freq, name=self.name) - def asof_locs(self, where, mask): + def asof_locs(self, where, mask: np.ndarray) -> np.ndarray: """ where : array of timestamps mask : array of booleans where data is not NA - """ where_idx = where if isinstance(where_idx, DatetimeIndex): - where_idx = PeriodIndex(where_idx.values, freq=self.freq) + where_idx = PeriodIndex(where_idx._values, freq=self.freq) + elif not isinstance(where_idx, PeriodIndex): + raise TypeError("asof_locs `where` must be DatetimeIndex or PeriodIndex") + elif where_idx.freq != self.freq: + raise raise_on_incompatible(self, where_idx) - locs = self._ndarray_values[mask].searchsorted( - where_idx._ndarray_values, side="right" - ) + locs = self.asi8[mask].searchsorted(where_idx.asi8, side="right") locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) first = mask.argmax() - result[ - (locs == 0) & (where_idx._ndarray_values < self._ndarray_values[first]) - ] = -1 + result[(locs == 0) & (where_idx.asi8 < self.asi8[first])] = -1 return result - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True, how="start"): dtype = pandas_dtype(dtype) @@ -470,12 +466,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def get_indexer_non_unique(self, target): target = ensure_index(target) - if isinstance(target, PeriodIndex): - if target.freq != self.freq: - no_matches = -1 * np.ones(self.shape, dtype=np.intp) - return no_matches, no_matches + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches - target = target.asi8 + target = target.asi8 indexer, missing = self._int64index.get_indexer_non_unique(target) return ensure_platform_int(indexer), missing @@ -515,9 +510,9 @@ def get_loc(self, key, method=None, tolerance=None): try: asdt, reso = parse_time_string(key, self.freq) - except DateParseError: + except DateParseError as err: # A string with invalid format - raise KeyError(f"Cannot interpret '{key}' as period") + raise KeyError(f"Cannot interpret '{key}' as period") from err grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) @@ -540,14 +535,14 @@ def get_loc(self, key, method=None, tolerance=None): try: key = Period(key, freq=self.freq) - except ValueError: + except ValueError as err: # we cannot construct the Period - raise KeyError(orig_key) + raise KeyError(orig_key) from err try: return Index.get_loc(self, key, method, tolerance) - except KeyError: - raise KeyError(orig_key) + except KeyError as err: + raise KeyError(orig_key) from err def _maybe_cast_slice_bound(self, label, side: str, kind: str): """ @@ -578,10 +573,10 @@ def _maybe_cast_slice_bound(self, label, side: str, kind: str): parsed, reso = parse_time_string(label, self.freq) bounds = self._parsed_string_to_bounds(reso, parsed) return bounds[0 if side == "left" else 1] - except ValueError: + except ValueError as err: # string cannot be parsed as datetime-like # TODO: we need tests for this case - raise KeyError(label) + raise KeyError(label) from err elif is_integer(label) or is_float(label): self._invalid_indexer("slice", label) @@ -611,17 +606,18 @@ def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True try: return self._partial_date_slice(reso, parsed, use_lhs, use_rhs) - except 
KeyError: - raise KeyError(key) + except KeyError as err: + raise KeyError(key) from err def insert(self, loc, item): if not isinstance(item, Period) or self.freq != item.freq: return self.astype(object).insert(loc, item) - idx = np.concatenate( + i8result = np.concatenate( (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) ) - return self._shallow_copy(idx) + arr = type(self._data)._simple_new(i8result, dtype=self.dtype) + return type(self)._simple_new(arr, name=self.name) def join(self, other, how="left", level=None, return_indexers=False, sort=False): """ @@ -781,10 +777,10 @@ def period_range( Examples -------- >>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') - PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', - '2017-06', '2017-06', '2017-07', '2017-08', '2017-09', - '2017-10', '2017-11', '2017-12', '2018-01'], - dtype='period[M]', freq='M') + PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', + '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', + '2018-01'], + dtype='period[M]', freq='M') If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor endpoints for a ``PeriodIndex`` with frequency matching that of the diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 71cc62e6a110b..b463b8d738d30 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -11,7 +11,7 @@ from pandas._typing import Label import pandas.compat as compat from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, @@ -141,7 +141,7 @@ def _simple_new(cls, values: range, name: Label = None) -> "RangeIndex": result._range = values result.name = name - + result._cache = {} result._reset_identity() return result @@ -168,7 +168,7 @@ def _data(self): return self._cached_data @cache_readonly - def _int64index(self): + def _int64index(self) -> Int64Index: return Int64Index._simple_new(self._data, name=self.name) def _get_data_as_items(self): @@ -342,15 +342,15 @@ def __contains__(self, key: Any) -> bool: return False return key in self._range - @Appender(Int64Index.get_loc.__doc__) + @doc(Int64Index.get_loc) def get_loc(self, key, method=None, tolerance=None): if method is None and tolerance is None: if is_integer(key) or (is_float(key) and key.is_integer()): new_key = int(key) try: return self._range.index(new_key) - except ValueError: - raise KeyError(key) + except ValueError as err: + raise KeyError(key) from err raise KeyError(key) return super().get_loc(key, method=method, tolerance=tolerance) @@ -386,16 +386,18 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def tolist(self): return list(self._range) - @Appender(Int64Index._shallow_copy.__doc__) + @doc(Int64Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name if values is None: - return self._simple_new(self._range, name=name) + result = self._simple_new(self._range, name=name) + result._cache = self._cache.copy() + return result else: return Int64Index._simple_new(values, name=name) - @Appender(Int64Index.copy.__doc__) + @doc(Int64Index.copy) def copy(self, name=None, deep=False, dtype=None, **kwargs): self._validate_dtype(dtype) if name is None: @@ -617,7 +619,7 @@ def _union(self, other, sort): return type(self)(start_r, end_r + 
step_o, step_o) return self._int64index._union(other, sort=sort) - @Appender(Int64Index.join.__doc__) + @doc(Int64Index.join) def join(self, other, how="left", level=None, return_indexers=False, sort=False): if how == "outer" and self is not other: # note: could return RangeIndex in more circumstances @@ -695,10 +697,10 @@ def __getitem__(self, key): new_key = int(key) try: return self._range[new_key] - except IndexError: + except IndexError as err: raise IndexError( f"index {key} is out of bounds for axis 0 with size {len(self)}" - ) + ) from err elif is_scalar(key): raise IndexError( "only integers, slices (`:`), " diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index b3b2bc46f6659..765b948f13e96 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,10 +1,11 @@ """ implement the TimedeltaIndex """ from pandas._libs import NaT, Timedelta, index as libindex -from pandas.util._decorators import Appender +from pandas._typing import DtypeObj, Label +from pandas.util._decorators import doc from pandas.core.dtypes.common import ( - _TD_DTYPE, + TD64NS_DTYPE, is_float, is_integer, is_scalar, @@ -133,7 +134,7 @@ def __new__( unit=None, freq=None, closed=None, - dtype=_TD_DTYPE, + dtype=TD64NS_DTYPE, copy=False, name=None, ): @@ -154,7 +155,7 @@ def __new__( if isinstance(data, TimedeltaArray) and freq is None: if copy: data = data.copy() - return cls._simple_new(data, name=name, freq=freq) + return cls._simple_new(data, name=name) if isinstance(data, TimedeltaIndex) and freq is None and name is None: if copy: @@ -170,16 +171,13 @@ def __new__( return cls._simple_new(tdarr, name=name) @classmethod - def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): - # `dtype` is passed by _shallow_copy in corner cases, should always - # be timedelta64[ns] if present - assert dtype == _TD_DTYPE, dtype + def _simple_new(cls, values: TimedeltaArray, name: Label = None): assert isinstance(values, TimedeltaArray) - assert freq is None or values.freq == freq result = object.__new__(cls) result._data = values result._name = name + result._cache = {} # For groupby perf. See note in indexes/base about _index_data result._index_data = values._data @@ -197,7 +195,7 @@ def _formatter_func(self): # ------------------------------------------------------------------- - @Appender(Index.astype.__doc__) + @doc(Index.astype) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype): @@ -215,6 +213,12 @@ def _maybe_promote(self, other): other = TimedeltaIndex(other) return self, other + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? 
+ """ + return is_timedelta64_dtype(dtype) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -232,8 +236,8 @@ def get_loc(self, key, method=None, tolerance=None): elif isinstance(key, str): try: key = Timedelta(key) - except ValueError: - raise KeyError(key) + except ValueError as err: + raise KeyError(key) from err elif isinstance(key, self._data._recognized_scalars) or key is NaT: key = Timedelta(key) @@ -347,7 +351,7 @@ def timedelta_range( >>> pd.timedelta_range(start='1 day', end='5 days', periods=4) TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00', '5 days 00:00:00'], - dtype='timedelta64[ns]', freq=None) + dtype='timedelta64[ns]', freq='32H') """ if freq is None and com.any_none(periods, start, end): freq = "D" diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py old mode 100755 new mode 100644 index 5adc65b488399..b74399ed86fbd --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,14 +1,14 @@ -from typing import Hashable, List, Tuple, Union +from typing import TYPE_CHECKING, Hashable, List, Tuple, Union import numpy as np from pandas._libs.indexing import _NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender +from pandas.util._decorators import doc from pandas.core.dtypes.common import ( - is_float, + is_hashable, is_integer, is_iterator, is_list_like, @@ -29,6 +29,9 @@ ) from pandas.core.indexes.api import Index, InvalidIndexError +if TYPE_CHECKING: + from pandas import DataFrame # noqa:F401 + # "null slice" _NS = slice(None, None) @@ -582,6 +585,9 @@ def _get_setitem_indexer(self, key): """ Convert a potentially-label-based key into a positional indexer. """ + if self.name == "loc": + self._ensure_listlike_indexer(key) + if self.axis is not None: return self._convert_tuple(key, is_setter=True) @@ -610,7 +616,41 @@ def _get_setitem_indexer(self, key): # invalid indexer type vs 'other' indexing errors if "cannot do" in str(e): raise - raise IndexingError(key) + raise IndexingError(key) from e + + def _ensure_listlike_indexer(self, key, axis=None): + """ + Ensure that a list-like of column labels are all present by adding them if + they do not already exist. + + Parameters + ---------- + key : list-like of column labels + Target labels. 
+ axis : key axis if known + """ + column_axis = 1 + + # column only exists in 2-dimensional DataFrame + if self.ndim != 2: + return + + if isinstance(key, tuple): + # key may be a tuple if we are .loc + # in that case, set key to the column part of key + key = key[column_axis] + axis = column_axis + + if ( + axis == column_axis + and not isinstance(self.obj.columns, ABCMultiIndex) + and is_list_like_indexer(key) + and not com.is_bool_indexer(key) + and all(is_hashable(k) for k in key) + ): + for k in key: + if k not in self.obj: + self.obj[k] = np.nan def __setitem__(self, key, value): if isinstance(key, tuple): @@ -654,11 +694,11 @@ def _has_valid_tuple(self, key: Tuple): raise IndexingError("Too many indexers") try: self._validate_key(k, i) - except ValueError: + except ValueError as err: raise ValueError( "Location based indexing can only have " f"[{self._valid_types}] types" - ) + ) from err def _is_nested_tuple_indexer(self, tup: Tuple) -> bool: """ @@ -787,7 +827,7 @@ def _getitem_nested_tuple(self, tup: Tuple): # this is iterative obj = self.obj axis = 0 - for i, key in enumerate(tup): + for key in tup: if com.is_null_slice(key): axis += 1 @@ -848,7 +888,7 @@ def _getbool_axis(self, key, axis: int): return self.obj._take_with_is_copy(inds, axis=axis) -@Appender(IndexingMixin.loc.__doc__) +@doc(IndexingMixin.loc) class _LocIndexer(_LocationIndexer): _takeable: bool = False _valid_types = ( @@ -860,23 +900,14 @@ class _LocIndexer(_LocationIndexer): # ------------------------------------------------------------------- # Key Checks - @Appender(_LocationIndexer._validate_key.__doc__) + @doc(_LocationIndexer._validate_key) def _validate_key(self, key, axis: int): # valid for a collection of labels (we check their presence later) # slice of labels (where start-end in labels) # slice of integers (only if in the labels) # boolean - - if isinstance(key, slice): - return - - if com.is_bool_indexer(key): - return - - if not is_list_like_indexer(key): - labels = self.obj._get_axis(axis) - labels._convert_scalar_indexer(key, kind="loc") + pass def _has_valid_setitem_indexer(self, indexer) -> bool: return True @@ -1140,15 +1171,6 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): if isinstance(key, slice): return labels._convert_slice_indexer(key, kind="loc") - if is_scalar(key): - # try to find out correct indexer, if not type correct raise - try: - key = labels._convert_scalar_indexer(key, kind="loc") - except TypeError: - # but we will allow setting - if not is_setter: - raise - # see if we are positional in nature is_int_index = labels.is_integer() is_int_positional = is_integer(key) and not is_int_index @@ -1308,7 +1330,7 @@ def _validate_read_indexer( ) -@Appender(IndexingMixin.iloc.__doc__) +@doc(IndexingMixin.iloc) class _iLocIndexer(_LocationIndexer): _valid_types = ( "integer, integer slice (START point is INCLUDED, END " @@ -1398,7 +1420,7 @@ def _is_scalar_access(self, key: Tuple) -> bool: if len(key) != self.ndim: return False - for i, k in enumerate(key): + for k in key: if not is_integer(k): return False @@ -1455,9 +1477,9 @@ def _get_list_axis(self, key, axis: int): """ try: return self.obj._take_with_is_copy(key, axis=axis) - except IndexError: + except IndexError as err: # re-raise with different error message - raise IndexError("positional indexers are out-of-bounds") + raise IndexError("positional indexers are out-of-bounds") from err def _getitem_axis(self, key, axis: int): if isinstance(key, slice): @@ -1500,18 +1522,10 @@ def 
_convert_to_indexer(self, key, axis: int, is_setter: bool = False): """ Much simpler as we only have to deal with our valid types. """ - labels = self.obj._get_axis(axis) - - # make need to convert a float key - if isinstance(key, slice): - labels._validate_positional_slice(key) - return key - - elif is_float(key): - # _validate_indexer call will always raise - labels._validate_indexer("positional", key, "iloc") + return key - self._validate_key(key, axis) + def _get_setitem_indexer(self, key): + # GH#32257 Fall through to let numnpy do validation return key # ------------------------------------------------------------------- @@ -1539,8 +1553,8 @@ def _setitem_with_indexer(self, indexer, value): # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value - if not take_split_path and self.obj._data.blocks: - (blk,) = self.obj._data.blocks + if not take_split_path and self.obj._mgr.blocks: + (blk,) = self.obj._mgr.blocks if 1 < blk.ndim: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value take_split_path = not blk._can_hold_element(val) @@ -1604,7 +1618,7 @@ def _setitem_with_indexer(self, indexer, value): # so the object is the same index = self.obj._get_axis(i) labels = index.insert(len(index), key) - self.obj._data = self.obj.reindex(labels, axis=i)._data + self.obj._mgr = self.obj.reindex(labels, axis=i)._mgr self.obj._maybe_update_cacher(clear=True) self.obj._is_copy = None @@ -1642,14 +1656,18 @@ def _setitem_with_indexer(self, indexer, value): info_idx = [info_idx] labels = item_labels[info_idx] + # Ensure we have something we can iterate over + ilocs = info_idx + if isinstance(info_idx, slice): + ri = Index(range(len(self.obj.columns))) + ilocs = ri[info_idx] + + plane_indexer = indexer[:1] + lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) + # lplane_indexer gives the expected length of obj[indexer[0]] + if len(labels) == 1: # We can operate on a single column - item = labels[0] - idx = indexer[0] - - plane_indexer = tuple([idx]) - lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) - # lplane_indexer gives the expected length of obj[idx] # require that we are setting the right number of values that # we are indexing @@ -1661,14 +1679,11 @@ def _setitem_with_indexer(self, indexer, value): "length than the value" ) - # non-mi - else: - plane_indexer = indexer[:1] - lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) + pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer - def setter(item, v): - ser = self.obj[item] - pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer + def isetter(loc, v): + # positional setting on column loc + ser = self.obj._ixs(loc, axis=1) # perform the equivalent of a setitem on the info axis # as we have a null slice or a slice with full bounds @@ -1684,11 +1699,11 @@ def setter(item, v): # set the item, possibly having a dtype change ser._consolidate_inplace() ser = ser.copy() - ser._data = ser._data.setitem(indexer=pi, value=v) + ser._mgr = ser._mgr.setitem(indexer=pi, value=v) ser._maybe_update_cacher(clear=True) # reset the sliced object if unique - self.obj[item] = ser + self.obj._iset_item(loc, ser) # we need an iterable, with a ndim of at least 1 # eg. 
don't pass through np.array(0) @@ -1698,8 +1713,10 @@ def setter(item, v): if isinstance(value, ABCDataFrame): sub_indexer = list(indexer) multiindex_indexer = isinstance(labels, ABCMultiIndex) + # TODO: we are implicitly assuming value.columns is unique - for item in labels: + for loc in ilocs: + item = item_labels[loc] if item in value: sub_indexer[info_axis] = item v = self._align_series( @@ -1708,7 +1725,7 @@ def setter(item, v): else: v = np.nan - setter(item, v) + isetter(loc, v) # we have an equal len ndarray/convertible to our labels # hasattr first, to avoid coercing to ndarray without reason. @@ -1719,43 +1736,47 @@ def setter(item, v): # note that this coerces the dtype if we are mixed # GH 7551 value = np.array(value, dtype=object) - if len(labels) != value.shape[1]: + if len(ilocs) != value.shape[1]: raise ValueError( "Must have equal len keys and value " "when setting with an ndarray" ) - for i, item in enumerate(labels): + for i, loc in enumerate(ilocs): + # setting with a list, re-coerces + isetter(loc, value[:, i].tolist()) - # setting with a list, recoerces - setter(item, value[:, i].tolist()) - - # we have an equal len list/ndarray - elif _can_do_equal_len( - labels, value, plane_indexer, lplane_indexer, self.obj + elif ( + len(labels) == 1 + and lplane_indexer == len(value) + and not is_scalar(plane_indexer[0]) ): - setter(labels[0], value) + # we have an equal len list/ndarray + # We only get here with len(labels) == len(ilocs) == 1 + isetter(ilocs[0], value) - # per label values - else: + elif lplane_indexer == 0 and len(value) == len(self.obj.index): + # We get here in one case via .loc with a all-False mask + pass - if len(labels) != len(value): + else: + # per-label values + if len(ilocs) != len(value): raise ValueError( "Must have equal len keys and value " "when setting with an iterable" ) - for item, v in zip(labels, value): - setter(item, v) + for loc, v in zip(ilocs, value): + isetter(loc, v) else: - # scalar - for item in labels: - setter(item, value) + # scalar value + for loc in ilocs: + isetter(loc, value) else: if isinstance(indexer, tuple): - indexer = maybe_convert_ix(*indexer) # if we are setting on the info axis ONLY # set using those methods to avoid block-splitting @@ -1773,6 +1794,8 @@ def setter(item, v): self.obj[item_labels[indexer[info_axis]]] = value return + indexer = maybe_convert_ix(*indexer) + if isinstance(value, (ABCSeries, dict)): # TODO(EA): ExtensionBlock.setitem this causes issues with # setting for extensionarrays that store dicts. 
Need to decide @@ -1787,7 +1810,7 @@ def setter(item, v): # actually do the set self.obj._consolidate_inplace() - self.obj._data = self.obj._data.setitem(indexer=indexer, value=value) + self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) self.obj._maybe_update_cacher(clear=True) def _setitem_with_indexer_missing(self, indexer, value): @@ -1819,9 +1842,9 @@ def _setitem_with_indexer_missing(self, indexer, value): # GH#22717 handle casting compatibility that np.concatenate # does incorrectly new_values = concat_compat([self.obj._values, new_values]) - self.obj._data = self.obj._constructor( + self.obj._mgr = self.obj._constructor( new_values, index=new_index, name=self.obj.name - )._data + )._mgr self.obj._maybe_update_cacher(clear=True) elif self.ndim == 2: @@ -1844,7 +1867,7 @@ def _setitem_with_indexer_missing(self, indexer, value): value = Series(value, index=self.obj.columns, name=indexer) - self.obj._data = self.obj.append(value)._data + self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) def _align_series(self, indexer, ser: ABCSeries, multiindex_indexer: bool = False): @@ -2026,7 +2049,7 @@ def __setitem__(self, key, value): self.obj._set_value(*key, value=value, takeable=self._takeable) -@Appender(IndexingMixin.at.__doc__) +@doc(IndexingMixin.at) class _AtIndexer(_ScalarAccessIndexer): _takeable = False @@ -2039,14 +2062,20 @@ def _convert_key(self, key, is_setter: bool = False): if is_setter: return list(key) - lkey = list(key) - for n, (ax, i) in enumerate(zip(self.obj.axes, key)): - lkey[n] = ax._convert_scalar_indexer(i, kind="loc") + return key + + def __getitem__(self, key): + if self.ndim != 1 or not is_scalar(key): + # FIXME: is_scalar check is a kludge + return super().__getitem__(key) - return tuple(lkey) + # Like Index.get_value, but we do not allow positional fallback + obj = self.obj + loc = obj.index.get_loc(key) + return obj.index._get_values_for_loc(obj, loc, key) -@Appender(IndexingMixin.iat.__doc__) +@doc(IndexingMixin.iat) class _iAtIndexer(_ScalarAccessIndexer): _takeable = True @@ -2080,7 +2109,7 @@ def _tuplify(ndim: int, loc: Hashable) -> Tuple[Union[Hashable, slice], ...]: return tuple(_tup) -def convert_to_index_sliceable(obj, key): +def convert_to_index_sliceable(obj: "DataFrame", key): """ If we are index sliceable, then return my slicer, otherwise return None. """ @@ -2091,7 +2120,7 @@ def convert_to_index_sliceable(obj, key): elif isinstance(key, str): # we are an actual column - if key in obj._data.items: + if key in obj.columns: return None # We might have a datetimelike string that we can translate to a @@ -2205,8 +2234,7 @@ def is_nested_tuple(tup, labels) -> bool: if not isinstance(tup, tuple): return False - for i, k in enumerate(tup): - + for k in tup: if is_list_like(k) or isinstance(k, slice): return isinstance(labels, ABCMultiIndex) @@ -2286,26 +2314,3 @@ def _maybe_numeric_slice(df, slice_, include_bool=False): dtypes.append(bool) slice_ = IndexSlice[:, df.select_dtypes(include=dtypes).columns] return slice_ - - -def _can_do_equal_len(labels, value, plane_indexer, lplane_indexer, obj) -> bool: - """ - Returns - ------- - bool - True if we have an equal len settable. 
- """ - if not len(labels) == 1 or not np.iterable(value) or is_scalar(plane_indexer[0]): - return False - - item = labels[0] - index = obj[item].index - - values_len = len(value) - # equal len list/ndarray - if len(index) == values_len: - return True - elif lplane_indexer == values_len: - return True - - return False diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 37a3405554745..7f06fb3a7788c 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -10,18 +10,11 @@ IntBlock, ObjectBlock, TimeDeltaBlock, - _block_shape, _safe_reshape, make_block, ) -from pandas.core.internals.managers import ( - BlockManager, - SingleBlockManager, - _transform_index, - concatenate_block_managers, - create_block_manager_from_arrays, - create_block_manager_from_blocks, -) +from pandas.core.internals.concat import concatenate_block_managers +from pandas.core.internals.managers import BlockManager, SingleBlockManager __all__ = [ "Block", @@ -37,11 +30,7 @@ "TimeDeltaBlock", "_safe_reshape", "make_block", - "_block_shape", "BlockManager", "SingleBlockManager", - "_transform_index", "concatenate_block_managers", - "create_block_manager_from_arrays", - "create_block_manager_from_blocks", ] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 34fa4c0e6544e..185b0f4da2627 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,5 +1,4 @@ from datetime import datetime, timedelta -import functools import inspect import re from typing import Any, List @@ -7,10 +6,11 @@ import numpy as np -from pandas._libs import NaT, Timestamp, algos as libalgos, lib, tslib, writers +from pandas._libs import NaT, algos as libalgos, lib, writers import pandas._libs.internals as libinternals -from pandas._libs.tslibs import Timedelta, conversion +from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare +from pandas._typing import ArrayLike from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -27,11 +27,9 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( - _NS_DTYPE, - _TD_DTYPE, - ensure_platform_int, + DT64NS_DTYPE, + TD64NS_DTYPE, is_bool_dtype, - is_categorical, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, @@ -51,10 +49,11 @@ pandas_dtype, ) from pandas.core.dtypes.concat import concat_categorical, concat_datetime -from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, + ABCIndexClass, ABCPandasArray, ABCSeries, ) @@ -66,6 +65,7 @@ ) import pandas.core.algorithms as algos +from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, DatetimeArray, @@ -110,7 +110,6 @@ class Block(PandasObject): _can_consolidate = True _verify_integrity = True _validate_ndim = True - _ftype = "dense" _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None): @@ -170,34 +169,19 @@ def _consolidate_key(self): return (self._can_consolidate, self.dtype.name) @property - def _is_single_block(self): + def _is_single_block(self) -> bool: return self.ndim == 1 @property - def is_view(self): + def is_view(self) -> bool: """ return a boolean if I am possibly a view """ return self.values.base is not None @property - def is_datelike(self): + def is_datelike(self) -> bool: """ return True if I 
am a non-datelike """ return self.is_datetime or self.is_timedelta - def is_categorical_astype(self, dtype): - """ - validate that we have a astypeable to categorical, - returns a boolean if we are a categorical - """ - if dtype is Categorical or dtype is CategoricalDtype: - # this is a pd.Categorical, but is not - # a valid type for astypeing - raise TypeError(f"invalid type {dtype} for astype") - - elif is_categorical_dtype(dtype): - return True - - return False - def external_values(self): """ The array that Series.values returns (public attribute). @@ -230,14 +214,12 @@ def get_values(self, dtype=None): return self.values.astype(object) return self.values - def get_block_values(self, dtype=None): + def get_block_values_for_json(self) -> np.ndarray: """ - This is used in the JSON C code + This is used in the JSON C code. """ - return self.get_values(dtype=dtype) - - def to_dense(self): - return self.values.view() + # TODO(2DEA): reshape will be unnecessary with 2D EAs + return np.asarray(self.values).reshape(self.shape) @property def fill_value(self): @@ -254,14 +236,6 @@ def mgr_locs(self, new_mgr_locs): self._mgr_locs = new_mgr_locs - @property - def array_dtype(self): - """ - the dtype to return if I want to construct this block as an - array - """ - return self.dtype - def make_block(self, values, placement=None) -> "Block": """ Create a new block, with type inference propagate any values that are @@ -269,6 +243,8 @@ def make_block(self, values, placement=None) -> "Block": """ if placement is None: placement = self.mgr_locs + if self.is_extension: + values = _block_shape(values, ndim=self.ndim) return make_block(values, placement=placement, ndim=self.ndim) @@ -305,6 +281,7 @@ def __setstate__(self, state): def _slice(self, slicer): """ return a slice of my values """ + return self.values[slicer] def getitem_block(self, slicer, new_mgr_locs=None): @@ -332,42 +309,31 @@ def shape(self): def dtype(self): return self.values.dtype - @property - def ftype(self): - if getattr(self.values, "_pandas_ftype", False): - dtype = self.dtype.subtype - else: - dtype = self.dtype - return f"{dtype}:{self._ftype}" - - def merge(self, other): - return _merge_blocks([self, other]) - - def concat_same_type(self, to_concat, placement=None): + def concat_same_type(self, to_concat): """ Concatenate list of single blocks of the same type. """ values = self._concatenator( [blk.values for blk in to_concat], axis=self.ndim - 1 ) - return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1) - ) + placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) + return self.make_block_same_class(values, placement=placement) def iget(self, i): return self.values[i] def set(self, locs, values): """ - Modify Block in-place with new item value + Modify block values in-place with new item value. - Returns - ------- - None + Notes + ----- + `set` never creates a new array or new Block, whereas `setitem` _may_ + create a new array and always creates a new Block. """ self.values[locs] = values - def delete(self, loc): + def delete(self, loc) -> None: """ Delete given loc(-s) from block in-place. 
""" @@ -391,17 +357,18 @@ def _split_op_result(self, result) -> List["Block"]: nbs = [] for i, loc in enumerate(self.mgr_locs): vals = result[i] - nv = _block_shape(vals, ndim=self.ndim) - block = self.make_block(values=nv, placement=[loc]) + block = self.make_block(values=vals, placement=[loc]) nbs.append(block) return nbs if not isinstance(result, Block): - result = self.make_block(values=_block_shape(result, ndim=self.ndim)) + result = self.make_block(result) return [result] - def fillna(self, value, limit=None, inplace=False, downcast=None): + def fillna( + self, value, limit=None, inplace: bool = False, downcast=None + ) -> List["Block"]: """ fillna on the block with the value. If we fail, then convert to ObjectBlock and try again @@ -415,9 +382,9 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): if not self._can_hold_na: if inplace: - return self + return [self] else: - return self.copy() + return [self.copy()] if self._can_hold_element(value): # equivalent: _try_coerce_args(value) would not raise @@ -426,7 +393,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): # we can't process the value, but nothing to do if not mask.any(): - return self if inplace else self.copy() + return [self] if inplace else [self.copy()] # operate column-by-column def f(mask, val, idx): @@ -440,7 +407,7 @@ def f(mask, val, idx): return self.split_and_operate(None, f, inplace) - def split_and_operate(self, mask, f, inplace: bool): + def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: """ split the block per-column, and apply the callable f per-column, return a new block for each. Handle @@ -581,7 +548,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): raise TypeError(msg) # may need to convert to categorical - if self.is_categorical_astype(dtype): + if is_categorical_dtype(dtype): if is_categorical_dtype(self.values): # GH 10696/18593: update an existing categorical efficiently @@ -610,7 +577,9 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # astype formatting else: - values = self.get_values() + # Because we have neither is_extension nor is_datelike, + # self.values already has the correct shape + values = self.values else: values = self.get_values(dtype=dtype) @@ -666,12 +635,24 @@ def _can_hold_element(self, element: Any) -> bool: return issubclass(tipo.type, dtype) return isinstance(element, dtype) - def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): - """ convert to our native types format, slicing if desired """ - values = self.get_values() + def should_store(self, value: ArrayLike) -> bool: + """ + Should we set self.values[indexer] = value inplace or do we need to cast? 
+ + Parameters + ---------- + value : np.ndarray or ExtensionArray + + Returns + ------- + bool + """ + return is_dtype_equal(value.dtype, self.dtype) + + def to_native_types(self, na_rep="nan", quoting=None, **kwargs): + """ convert to our native types format """ + values = self.values - if slicer is not None: - values = values[:, slicer] mask = isna(values) itemsize = writers.word_len(na_rep) @@ -687,7 +668,7 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): return values # block actions # - def copy(self, deep=True): + def copy(self, deep: bool = True): """ copy constructor """ values = self.values if deep: @@ -695,7 +676,12 @@ def copy(self, deep=True): return self.make_block_same_class(values, ndim=self.ndim) def replace( - self, to_replace, value, inplace=False, filter=None, regex=False, convert=True + self, + to_replace, + value, + inplace: bool = False, + regex: bool = False, + convert: bool = True, ): """ replace the to_replace value with value, possible to create new @@ -725,12 +711,7 @@ def replace( # _can_hold_element checks have reduced this back to the # scalar case and we can avoid a costly object cast return self.replace( - to_replace[0], - value, - inplace=inplace, - filter=filter, - regex=regex, - convert=convert, + to_replace[0], value, inplace=inplace, regex=regex, convert=convert, ) # GH 22083, TypeError or ValueError occurred within error handling @@ -744,7 +725,6 @@ def replace( to_replace=to_replace, value=value, inplace=inplace, - filter=filter, regex=regex, convert=convert, ) @@ -757,9 +737,6 @@ def replace( to_replace = convert_scalar_for_putitemlike(to_replace, values.dtype) mask = missing.mask_missing(values, to_replace) - if filter is not None: - filtered_out = ~self.mgr_locs.isin(filter) - mask[filtered_out.nonzero()[0]] = False if not mask.any(): if inplace: @@ -788,7 +765,6 @@ def replace( to_replace=original_to_replace, value=value, inplace=inplace, - filter=filter, regex=regex, convert=convert, ) @@ -802,7 +778,7 @@ def _replace_single(self, *args, **kwargs): def setitem(self, indexer, value): """ - Set the value inplace, returning a a maybe different typed block. + Attempt self.values[indexer] = value, possibly creating a new array. Parameters ---------- @@ -840,21 +816,24 @@ def setitem(self, indexer, value): else: # current dtype cannot store value, coerce to common dtype - find_dtype = False if hasattr(value, "dtype"): dtype = value.dtype - find_dtype = True elif lib.is_scalar(value) and not isna(value): dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) - find_dtype = True - if find_dtype: - dtype = find_common_type([values.dtype, dtype]) - if not is_dtype_equal(self.dtype, dtype): - b = self.astype(dtype) - return b.setitem(indexer, value) + else: + # e.g. 
we are bool dtype and value is nan + # TODO: watch out for case with listlike value and scalar/empty indexer + dtype, _ = maybe_promote(np.array(value).dtype) + return self.astype(dtype).setitem(indexer, value) + + dtype = find_common_type([values.dtype, dtype]) + assert not is_dtype_equal(self.dtype, dtype) + # otherwise should have _can_hold_element + + return self.astype(dtype).setitem(indexer, value) # value must be storeable at this moment if is_extension_array_dtype(getattr(value, "dtype", None)): @@ -864,11 +843,6 @@ def setitem(self, indexer, value): else: arr_value = np.array(value) - # cast the values to a type that can hold nan (if necessary) - if not self._can_hold_element(value): - dtype, _ = maybe_promote(arr_value.dtype) - values = values.astype(dtype) - if transpose: values = values.T @@ -883,16 +857,12 @@ def setitem(self, indexer, value): # GH#8669 empty indexers pass - elif is_scalar_indexer(indexer, arr_value): + elif is_scalar_indexer(indexer, self.ndim): # setting a single element for each dim and with a rhs that could # be e.g. a list; see GH#6043 values[indexer] = value - elif ( - exact_match - and is_categorical_dtype(arr_value.dtype) - and not is_categorical_dtype(values) - ): + elif exact_match and is_categorical_dtype(arr_value.dtype): # GH25495 - If the current dtype is not categorical, # we need to create a new categorical block values[indexer] = value @@ -901,12 +871,10 @@ def setitem(self, indexer, value): # if we are an exact match (ex-broadcasting), # then use the resultant dtype elif exact_match: + # We are setting _all_ of the array's values, so can cast to new dtype values[indexer] = value - try: - values = values.astype(arr_value.dtype) - except ValueError: - pass + values = values.astype(arr_value.dtype, copy=False) # set else: @@ -917,31 +885,33 @@ def setitem(self, indexer, value): block = self.make_block(values) return block - def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False): + def putmask( + self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, + ) -> List["Block"]: """ putmask the data to the block; it is possible that we may create a new dtype of block - return the resulting block(s) + Return the resulting block(s). Parameters ---------- - mask : the condition to respect + mask : np.ndarray[bool], SparseArray[bool], or BooleanArray new : a ndarray/object - align : boolean, perform alignment on other/cond, default is True - inplace : perform inplace modification, default is False + inplace : bool, default False + Perform inplace modification. axis : int - transpose : boolean - Set to True if self is stored with axes reversed + transpose : bool, default False + Set to True if self is stored with axes reversed. 
Returns ------- - a list of new blocks, the result of the putmask + List[Block] """ - new_values = self.values if inplace else self.values.copy() + mask = _extract_bool_array(mask) + assert not isinstance(new, (ABCIndexClass, ABCSeries, ABCDataFrame)) - new = getattr(new, "values", new) - mask = getattr(mask, "values", mask) + new_values = self.values if inplace else self.values.copy() # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: @@ -1107,7 +1077,6 @@ def interpolate( method="pad", axis=0, index=None, - values=None, inplace=False, limit=None, limit_direction="forward", @@ -1157,7 +1126,6 @@ def check_int_bool(self, inplace): return self._interpolate( method=m, index=index, - values=values, axis=axis, limit=limit, limit_direction=limit_direction, @@ -1177,7 +1145,7 @@ def _interpolate_with_fill( fill_value=None, coerce=False, downcast=None, - ): + ) -> List["Block"]: """ fillna but using the interpolate machinery """ inplace = validate_bool_kwarg(inplace, "inplace") @@ -1211,7 +1179,6 @@ def _interpolate( self, method=None, index=None, - values=None, fill_value=None, axis=0, limit=None, @@ -1220,7 +1187,7 @@ def _interpolate( inplace=False, downcast=None, **kwargs, - ): + ) -> List["Block"]: """ interpolate using scipy wrappers """ inplace = validate_bool_kwarg(inplace, "inplace") data = self.values if inplace else self.values.copy() @@ -1228,7 +1195,7 @@ def _interpolate( # only deal with floats if not self.is_float: if not self.is_integer: - return self + return [self] data = data.astype(np.float64) if fill_value is None: @@ -1264,7 +1231,7 @@ def func(x): blocks = [self.make_block_same_class(interp_values)] return self._maybe_downcast(blocks, downcast) - def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): + def take_nd(self, indexer, axis: int, new_mgr_locs=None, fill_value=lib.no_default): """ Take values according to indexer and return them as a block.bb @@ -1275,11 +1242,10 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): values = self.values - if fill_tuple is None: + if fill_value is lib.no_default: fill_value = self.fill_value allow_fill = False else: - fill_value = fill_tuple[0] allow_fill = True new_values = algos.take_nd( @@ -1300,47 +1266,20 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): def diff(self, n: int, axis: int = 1) -> List["Block"]: """ return block for the diff of the values """ new_values = algos.diff(self.values, n, axis=axis, stacklevel=7) - # We use block_shape for ExtensionBlock subclasses, which may call here - # via a super. - new_values = _block_shape(new_values, ndim=self.ndim) return [self.make_block(values=new_values)] - def shift(self, periods, axis=0, fill_value=None): + def shift(self, periods: int, axis: int = 0, fill_value=None): """ shift the block by periods, possibly upcast """ # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also new_values, fill_value = maybe_upcast(self.values, fill_value) - # make sure array sent to np.roll is c_contiguous - f_ordered = new_values.flags.f_contiguous - if f_ordered: - new_values = new_values.T - axis = new_values.ndim - axis - 1 - - if np.prod(new_values.shape): - new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis) - - axis_indexer = [slice(None)] * self.ndim - if periods > 0: - axis_indexer[axis] = slice(None, periods) - else: - axis_indexer[axis] = slice(periods, None) - new_values[tuple(axis_indexer)] = fill_value - - # restore original order - if f_ordered: - new_values = new_values.T + new_values = shift(new_values, periods, axis, fill_value) return [self.make_block(new_values)] def where( - self, - other, - cond, - align=True, - errors="raise", - try_cast: bool = False, - axis: int = 0, + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, ) -> List["Block"]: """ evaluate the block; return result block(s) from the result @@ -1348,19 +1287,21 @@ def where( Parameters ---------- other : a ndarray/object - cond : the condition to respect - align : boolean, perform alignment on other/cond + cond : np.ndarray[bool], SparseArray[bool], or BooleanArray errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object - axis : int + axis : int, default 0 Returns ------- - a new block(s), the result of the func + List[Block] """ import pandas.core.computation.expressions as expressions + cond = _extract_bool_array(cond) + assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)) + assert errors in ["raise", "ignore"] transpose = self.ndim == 2 @@ -1369,9 +1310,6 @@ def where( if transpose: values = values.T - other = getattr(other, "_values", getattr(other, "values", other)) - cond = getattr(cond, "values", cond) - # If the default broadcasting would go in the wrong direction, then # explicitly reshape other instead if getattr(other, "ndim", 0) >= 1: @@ -1415,12 +1353,7 @@ def where_func(cond, values, other): # we are explicitly ignoring errors block = self.coerce_to_target_dtype(other) blocks = block.where( - orig_other, - cond, - align=align, - errors=errors, - try_cast=try_cast, - axis=axis, + orig_other, cond, errors=errors, try_cast=try_cast, axis=axis, ) return self._maybe_downcast(blocks, "infer") @@ -1451,18 +1384,13 @@ def equals(self, other) -> bool: return False return array_equivalent(self.values, other.values) - def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): + def _unstack(self, unstacker, fill_value, new_placement): """ Return a list of unstacked blocks of self Parameters ---------- - unstacker_func : callable - Partially applied unstacker. - new_columns : Index - All columns of the unstacked BlockManager. - n_rows : int - Only used in ExtensionBlock._unstack + unstacker : reshape._Unstacker fill_value : int Only used in ExtensionBlock._unstack @@ -1473,19 +1401,20 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): mask : array_like of bool The mask of columns of `blocks` we should keep. 
""" - unstacker = unstacker_func(self.values.T) - new_items = unstacker.get_new_columns() - new_placement = new_columns.get_indexer(new_items) - new_values, mask = unstacker.get_new_values() + new_values, mask = unstacker.get_new_values( + self.values.T, fill_value=fill_value + ) mask = mask.any(0) + # TODO: in all tests we have mask.all(); can we rely on that? + new_values = new_values.T[mask] new_placement = new_placement[mask] - blocks = [make_block(new_values, placement=new_placement)] + blocks = [self.make_block_same_class(new_values, placement=new_placement)] return blocks, mask - def quantile(self, qs, interpolation="linear", axis=0): + def quantile(self, qs, interpolation="linear", axis: int = 0): """ compute the quantiles of the @@ -1542,7 +1471,13 @@ def quantile(self, qs, interpolation="linear", axis=0): return make_block(result, placement=np.arange(len(result)), ndim=ndim) def _replace_coerce( - self, to_replace, value, inplace=True, regex=False, convert=False, mask=None + self, + to_replace, + value, + inplace: bool = True, + regex: bool = False, + convert: bool = False, + mask=None, ): """ Replace value corresponding to the given boolean array with another @@ -1554,7 +1489,7 @@ def _replace_coerce( Scalar to replace or regular expression to match. value : object Replacement object. - inplace : bool, default False + inplace : bool, default True Perform inplace modification. regex : bool, default False If true, perform regular expression substitution. @@ -1583,12 +1518,22 @@ def _replace_coerce( return self -class NonConsolidatableMixIn: - """ hold methods for the nonconsolidatable blocks """ +class ExtensionBlock(Block): + """ + Block for holding extension types. + + Notes + ----- + This holds all 3rd-party extension array types. It's also the immediate + parent class for our internal extension types' blocks, CategoricalBlock. + + ExtensionArrays are limited to 1-D. + """ _can_consolidate = False _verify_integrity = False _validate_ndim = False + is_extension = True def __init__(self, values, placement, ndim=None): """ @@ -1599,6 +1544,8 @@ def __init__(self, values, placement, ndim=None): This will call continue to call __init__ for the other base classes mixed in with this Mixin. """ + values = self._maybe_coerce_values(values) + # Placement must be converted to BlockPlacement so that we can check # its length if not isinstance(placement, libinternals.BlockPlacement): @@ -1612,6 +1559,10 @@ def __init__(self, values, placement, ndim=None): ndim = 2 super().__init__(values, placement, ndim=ndim) + if self.ndim == 2 and len(self.mgr_locs) != 1: + # TODO(2DEA): check unnecessary with 2D EAs + raise AssertionError("block.size != values.size") + @property def shape(self): if self.ndim == 1: @@ -1634,36 +1585,27 @@ def iget(self, col): raise IndexError(f"{self} only contains one item") return self.values - def should_store(self, value): + def should_store(self, value: ArrayLike) -> bool: + """ + Can we set the given array-like value inplace? 
+ """ return isinstance(value, self._holder) - def set(self, locs, values, check=False): + def set(self, locs, values): assert locs.tolist() == [0] - self.values = values + self.values[:] = values - def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False): + def putmask( + self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, + ) -> List["Block"]: """ - putmask the data to the block; we must be a single block and not - generate other blocks - - return the resulting block - - Parameters - ---------- - mask : the condition to respect - new : a ndarray/object - align : boolean, perform alignment on other/cond, default is True - inplace : perform inplace modification, default is False - - Returns - ------- - a new block, the result of the putmask + See Block.putmask.__doc__ """ inplace = validate_bool_kwarg(inplace, "inplace") - # use block's copy logic. - # .values may be an Index which does shallow copy by default - new_values = self.values if inplace else self.copy().values + mask = _extract_bool_array(mask) + + new_values = self.values if inplace else self.values.copy() if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] @@ -1673,57 +1615,6 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) new_values[mask] = new return [self.make_block(values=new_values)] - def _get_unstack_items(self, unstacker, new_columns): - """ - Get the placement, values, and mask for a Block unstack. - - This is shared between ObjectBlock and ExtensionBlock. They - differ in that ObjectBlock passes the values, while ExtensionBlock - passes the dummy ndarray of positions to be used by a take - later. - - Parameters - ---------- - unstacker : pandas.core.reshape.reshape._Unstacker - new_columns : Index - All columns of the unstacked BlockManager. - - Returns - ------- - new_placement : ndarray[int] - The placement of the new columns in `new_columns`. - new_values : Union[ndarray, ExtensionArray] - The first return value from _Unstacker.get_new_values. - mask : ndarray[bool] - The second return value from _Unstacker.get_new_values. - """ - # shared with ExtensionBlock - new_items = unstacker.get_new_columns() - new_placement = new_columns.get_indexer(new_items) - new_values, mask = unstacker.get_new_values() - - mask = mask.any(0) - return new_placement, new_values, mask - - -class ExtensionBlock(NonConsolidatableMixIn, Block): - """ - Block for holding extension types. - - Notes - ----- - This holds all 3rd-party extension array types. It's also the immediate - parent class for our internal extension types' blocks, CategoricalBlock. - - ExtensionArrays are limited to 1-D. - """ - - is_extension = True - - def __init__(self, values, placement, ndim=None): - values = self._maybe_coerce_values(values) - super().__init__(values, placement, ndim) - def _maybe_coerce_values(self, values): """ Unbox to an extension array. @@ -1757,7 +1648,7 @@ def _can_hold_na(self): return self._holder._can_hold_na @property - def is_view(self): + def is_view(self) -> bool: """Extension arrays are never treated as views.""" return False @@ -1767,7 +1658,7 @@ def is_numeric(self): def setitem(self, indexer, value): """ - Set the value inplace, returning a same-typed block. + Attempt self.values[indexer] = value, possibly creating a new array. This differs from Block.setitem by not allowing setitem to change the dtype of the Block. 
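The reworked `ExtensionBlock.putmask` above normalizes the mask via `_extract_bool_array` and then assigns element-wise into a (possibly copied) extension array, first narrowing an aligned ndarray right-hand side down to the masked positions. Not part of the patch: a minimal runnable sketch of that assignment pattern using a public `Int64` extension array; the sample values are invented.

```python
import numpy as np
import pandas as pd

arr = pd.array([1, 2, 3, 4], dtype="Int64")      # ExtensionArray-backed values
mask = np.array([False, True, False, True])       # already a plain ndarray[bool]
new = np.array([10, 20, 30, 40])

values = arr.copy()        # the inplace=False branch works on a copy
new = new[mask]            # aligned ndarray rhs is narrowed to the masked slots
values[mask] = new         # element-wise masked assignment, as in putmask
print(values)              # [1, 20, 3, 40], dtype Int64
```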
@@ -1798,22 +1689,14 @@ def setitem(self, indexer, value): def get_values(self, dtype=None): # ExtensionArrays must be iterable, so this works. - values = np.asarray(self.values) - if values.ndim == self.ndim - 1: - values = values.reshape((1,) + values.shape) - return values + return np.asarray(self.values).reshape(self.shape) def array_values(self) -> ExtensionArray: return self.values - def to_dense(self): - return np.asarray(self.values) - - def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): + def to_native_types(self, na_rep="nan", quoting=None, **kwargs): """override to use ExtensionArray astype for the conversion""" values = self.values - if slicer is not None: - values = values[slicer] mask = isna(values) values = np.asarray(values.astype(object)) @@ -1822,14 +1705,14 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): # we are expected to return a 2-d ndarray return values.reshape(1, len(values)) - def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): + def take_nd( + self, indexer, axis: int = 0, new_mgr_locs=None, fill_value=lib.no_default + ): """ Take values according to indexer and return them as a block. """ - if fill_tuple is None: + if fill_value is lib.no_default: fill_value = None - else: - fill_value = fill_tuple[0] # axis doesn't matter; we are really a single-dim object # but are passed the axis depending on the calling routing @@ -1850,24 +1733,50 @@ def _can_hold_element(self, element: Any) -> bool: return True def _slice(self, slicer): - """ return a slice of my values """ - # slice the category + """ + Return a slice of my values. + + Parameters + ---------- + slicer : slice, ndarray[int], or a tuple of these + Valid (non-reducing) indexer for self.values. + + Returns + ------- + np.ndarray or ExtensionArray + """ # return same dims as we currently have + if not isinstance(slicer, tuple) and self.ndim == 2: + # reached via getitem_block via _slice_take_blocks_ax0 + # TODO(EA2D): wont be necessary with 2D EAs + slicer = (slicer, slice(None)) if isinstance(slicer, tuple) and len(slicer) == 2: - if not com.is_null_slice(slicer[0]): - raise AssertionError("invalid slicing for a 1-ndim categorical") - slicer = slicer[1] + first = slicer[0] + if not isinstance(first, slice): + raise AssertionError( + "invalid slicing for a 1-ndim ExtensionArray", first + ) + # GH#32959 only full-slicers along fake-dim0 are valid + # TODO(EA2D): wont be necessary with 2D EAs + new_locs = self.mgr_locs[first] + if len(new_locs): + # effectively slice(None) + slicer = slicer[1] + else: + raise AssertionError( + "invalid slicing for a 1-ndim ExtensionArray", slicer + ) return self.values[slicer] - def concat_same_type(self, to_concat, placement=None): + def concat_same_type(self, to_concat): """ Concatenate list of single blocks of the same type. 
""" values = self._holder._concat_same_type([blk.values for blk in to_concat]) - placement = placement or slice(0, len(values), 1) - return self.make_block_same_class(values, ndim=self.ndim, placement=placement) + placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) + return self.make_block_same_class(values, placement=placement) def fillna(self, value, limit=None, inplace=False, downcast=None): values = self.values if inplace else self.values.copy() @@ -1889,16 +1798,20 @@ def interpolate( ) def diff(self, n: int, axis: int = 1) -> List["Block"]: + if axis == 0 and n != 0: + # n==0 case will be a no-op so let is fall through + # Since we only have one column, the result will be all-NA. + # Create this result by shifting along axis=0 past the length of + # our values. + return super().diff(len(self.values), axis=0) if axis == 1: + # TODO(EA2D): unnecessary with 2D EAs # we are by definition 1D. axis = 0 return super().diff(n, axis) def shift( - self, - periods: int, - axis: libinternals.BlockPlacement = 0, - fill_value: Any = None, + self, periods: int, axis: int = 0, fill_value: Any = None, ) -> List["ExtensionBlock"]: """ Shift the block by `periods`. @@ -1915,27 +1828,21 @@ def shift( ] def where( - self, - other, - cond, - align=True, - errors="raise", - try_cast: bool = False, - axis: int = 0, + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, ) -> List["Block"]: - if isinstance(other, ABCDataFrame): - # ExtensionArrays are 1-D, so if we get here then - # `other` should be a DataFrame with a single column. - assert other.shape[1] == 1 - other = other.iloc[:, 0] - other = extract_array(other, extract_numpy=True) + cond = _extract_bool_array(cond) + assert not isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)) - if isinstance(cond, ABCDataFrame): - assert cond.shape[1] == 1 - cond = cond.iloc[:, 0] + if isinstance(other, np.ndarray) and other.ndim == 2: + # TODO(EA2D): unnecessary with 2D EAs + assert other.shape[1] == 1 + other = other[:, 0] - cond = extract_array(cond, extract_numpy=True) + if isinstance(cond, np.ndarray) and cond.ndim == 2: + # TODO(EA2D): unnecessary with 2D EAs + assert cond.shape[1] == 1 + cond = cond[:, 0] if lib.is_scalar(other) and isna(other): # The default `other` for Series / Frame is np.nan @@ -1969,24 +1876,19 @@ def where( return [self.make_block_same_class(result, placement=self.mgr_locs)] - @property - def _ftype(self): - return getattr(self.values, "_pandas_ftype", Block._ftype) - - def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): + def _unstack(self, unstacker, fill_value, new_placement): # ExtensionArray-safe unstack. # We override ObjectBlock._unstack, which unstacks directly on the # values of the array. For EA-backed blocks, this would require # converting to a 2-D ndarray of objects. # Instead, we unstack an ndarray of integer positions, followed by # a `take` on the actual values. + n_rows = self.shape[-1] dummy_arr = np.arange(n_rows) - dummy_unstacker = functools.partial(unstacker_func, fill_value=-1) - unstacker = dummy_unstacker(dummy_arr) - new_placement, new_values, mask = self._get_unstack_items( - unstacker, new_columns - ) + new_values, mask = unstacker.get_new_values(dummy_arr, fill_value=-1) + mask = mask.any(0) + # TODO: in all tests we have mask.all(); can we rely on that? 
blocks = [ self.make_block_same_class( @@ -2044,18 +1946,10 @@ def _can_hold_element(self, element: Any) -> bool: ) def to_native_types( - self, - slicer=None, - na_rep="", - float_format=None, - decimal=".", - quoting=None, - **kwargs, + self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs, ): - """ convert to our native types format, slicing if desired """ + """ convert to our native types format """ values = self.values - if slicer is not None: - values = values[:, slicer] # see gh-13418: no special formatting is desired at the # output (important for appropriate 'quoting' behaviour), @@ -2083,11 +1977,6 @@ def to_native_types( ) return formatter.get_result_as_array() - def should_store(self, value): - # when inserting a column should not coerce integers to floats - # unnecessarily - return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype - class ComplexBlock(FloatOrComplexBlock): __slots__ = () @@ -2101,7 +1990,7 @@ def _can_hold_element(self, element: Any) -> bool: element, (float, int, complex, np.float_, np.int_) ) and not isinstance(element, (bool, np.bool_)) - def should_store(self, value): + def should_store(self, value: ArrayLike) -> bool: return issubclass(value.dtype.type, np.complexfloating) @@ -2120,9 +2009,6 @@ def _can_hold_element(self, element: Any) -> bool: ) return is_integer(element) - def should_store(self, value): - return is_integer_dtype(value) and value.dtype == self.dtype - class DatetimeLikeBlockMixin: """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" @@ -2140,24 +2026,27 @@ def get_values(self, dtype=None): return object dtype as boxed values, such as Timestamps/Timedelta """ if is_object_dtype(dtype): - values = self.values.ravel() - result = self._holder(values).astype(object) - return result.reshape(self.values.shape) + # DTA/TDA constructor and astype can handle 2D + return self._holder(self.values).astype(object) return self.values def internal_values(self): # Override to return DatetimeArray and TimedeltaArray return self.array_values() + def array_values(self): + return self._holder._simple_new(self.values) + def iget(self, key): # GH#31649 we need to wrap scalars in Timestamp/Timedelta - # TODO: this can be removed if we ever have 2D EA - result = super().iget(key) - if isinstance(result, np.datetime64): - result = Timestamp(result) - elif isinstance(result, np.timedelta64): - result = Timedelta(result) - return result + # TODO(EA2D): this can be removed if we ever have 2D EA + return self.array_values().reshape(self.shape)[key] + + def shift(self, periods, axis=0, fill_value=None): + # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs + values = self.array_values() + new_values = values.shift(periods, fill_value=fill_value, axis=axis) + return self.make_block_same_class(new_values) class DatetimeBlock(DatetimeLikeBlockMixin, Block): @@ -2188,7 +2077,7 @@ def _maybe_coerce_values(self, values): Overridden by DatetimeTZBlock. 
""" - if values.dtype != _NS_DTYPE: + if values.dtype != DT64NS_DTYPE: values = conversion.ensure_datetime64ns(values) if isinstance(values, DatetimeArray): @@ -2207,6 +2096,9 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # if we are passed a datetime64[ns, tz] if is_datetime64tz_dtype(dtype): values = self.values + if copy: + # this should be the only copy + values = values.copy() if getattr(values, "tz", None) is None: values = DatetimeArray(values).tz_localize("UTC") values = values.tz_convert(dtype.tz) @@ -2235,54 +2127,23 @@ def _can_hold_element(self, element: Any) -> bool: return is_valid_nat_for_dtype(element, self.dtype) - def to_native_types( - self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs - ): - """ convert to our native types format, slicing if desired """ - values = self.values - i8values = self.values.view("i8") - - if slicer is not None: - values = values[..., slicer] - i8values = i8values[..., slicer] + def to_native_types(self, na_rep="NaT", date_format=None, **kwargs): + """ convert to our native types format """ + dta = self.array_values() - from pandas.io.formats.format import _get_format_datetime64_from_values - - fmt = _get_format_datetime64_from_values(values, date_format) - - result = tslib.format_array_from_datetime( - i8values.ravel(), - tz=getattr(self.values, "tz", None), - format=fmt, - na_rep=na_rep, - ).reshape(i8values.shape) - return np.atleast_2d(result) - - def should_store(self, value): - return ( - issubclass(value.dtype.type, np.datetime64) - and not is_datetime64tz_dtype(value) - and not is_extension_array_dtype(value) + result = dta._format_native_types( + na_rep=na_rep, date_format=date_format, **kwargs ) + return np.atleast_2d(result) def set(self, locs, values): """ - Modify Block in-place with new item value - - Returns - ------- - None + See Block.set.__doc__ """ values = conversion.ensure_datetime64ns(values, copy=False) self.values[locs] = values - def external_values(self): - return np.asarray(self.values.astype("datetime64[ns]", copy=False)) - - def array_values(self) -> ExtensionArray: - return DatetimeArray._simple_new(self.values) - class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ @@ -2295,6 +2156,8 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types fill_value = np.datetime64("NaT", "ns") + should_store = Block.should_store + array_values = ExtensionBlock.array_values @property def _holder(self): @@ -2323,7 +2186,7 @@ def _maybe_coerce_values(self, values): return values @property - def is_view(self): + def is_view(self) -> bool: """ return a boolean if I am possibly a view """ # check the ndarray values of the DatetimeIndex values return self.values._data.base is not None @@ -2353,29 +2216,16 @@ def get_values(self, dtype=None): if is_object_dtype(dtype): values = values.astype(object) - values = np.asarray(values) - - if self.ndim == 2: - # Ensure that our shape is correct for DataFrame. - # ExtensionArrays are always 1-D, even in a DataFrame when - # the analogous NumPy-backed column would be a 2-D ndarray. - values = values.reshape(1, -1) - return values - - def to_dense(self): - # we request M8[ns] dtype here, even though it discards tzinfo, - # as lots of code (e.g. anything using values_from_object) - # expects that behavior. 
- return np.asarray(self.values, dtype=_NS_DTYPE) + # TODO(EA2D): reshape unnecessary with 2D EAs + # Ensure that our shape is correct for DataFrame. + # ExtensionArrays are always 1-D, even in a DataFrame when + # the analogous NumPy-backed column would be a 2-D ndarray. + return np.asarray(values).reshape(self.shape) - def _slice(self, slicer): - """ return a slice of my values """ - if isinstance(slicer, tuple): - col, loc = slicer - if not com.is_null_slice(col) and col != 0: - raise IndexError(f"{self} only contains one item") - return self.values[loc] - return self.values[slicer] + def external_values(self): + # NB: this is different from np.asarray(self.values), since that + # return an object-dtype ndarray of Timestamps. + return np.asarray(self.values.astype("datetime64[ns]", copy=False)) def diff(self, n: int, axis: int = 0) -> List["Block"]: """ @@ -2408,19 +2258,19 @@ def diff(self, n: int, axis: int = 0) -> List["Block"]: new_values = new_values.astype("timedelta64[ns]") return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] - def concat_same_type(self, to_concat, placement=None): + def concat_same_type(self, to_concat): # need to handle concat([tz1, tz2]) here, since DatetimeArray # only handles cases where all the tzs are the same. # Instead of placing the condition here, it could also go into the # is_uniform_join_units check, but I'm not sure what is better. if len({x.dtype for x in to_concat}) > 1: values = concat_datetime([x.values for x in to_concat]) - placement = placement or slice(0, len(values), 1) - if self.ndim > 1: - values = np.atleast_2d(values) - return ObjectBlock(values, ndim=self.ndim, placement=placement) - return super().concat_same_type(to_concat, placement) + values = values.astype(object, copy=False) + placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) + + return self.make_block(values, placement=placement) + return super().concat_same_type(to_concat) def fillna(self, value, limit=None, inplace=False, downcast=None): # We support filling a DatetimeTZ with a `value` whose timezone @@ -2476,7 +2326,7 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): fill_value = np.timedelta64("NaT", "ns") def __init__(self, values, placement, ndim=None): - if values.dtype != _TD_DTYPE: + if values.dtype != TD64NS_DTYPE: values = conversion.ensure_timedelta64ns(values) if isinstance(values, TimedeltaArray): values = values._data @@ -2510,39 +2360,10 @@ def fillna(self, value, **kwargs): ) return super().fillna(value, **kwargs) - def should_store(self, value): - return issubclass( - value.dtype.type, np.timedelta64 - ) and not is_extension_array_dtype(value) - - def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): - """ convert to our native types format, slicing if desired """ - values = self.values - if slicer is not None: - values = values[:, slicer] - mask = isna(values) - - rvalues = np.empty(values.shape, dtype=object) - if na_rep is None: - na_rep = "NaT" - rvalues[mask] = na_rep - imask = (~mask).ravel() - - # FIXME: - # should use the formats.format.Timedelta64Formatter here - # to figure what format to pass to the Timedelta - # e.g. 
to not show the decimals say - rvalues.flat[imask] = np.array( - [Timedelta(val)._repr_base(format="all") for val in values.ravel()[imask]], - dtype=object, - ) - return rvalues - - def external_values(self): - return np.asarray(self.values.astype("timedelta64[ns]", copy=False)) - - def array_values(self) -> ExtensionArray: - return TimedeltaArray._simple_new(self.values) + def to_native_types(self, na_rep="NaT", **kwargs): + """ convert to our native types format """ + tda = self.array_values() + return tda._format_native_types(na_rep, **kwargs) class BoolBlock(NumericBlock): @@ -2556,25 +2377,13 @@ def _can_hold_element(self, element: Any) -> bool: return issubclass(tipo.type, np.bool_) return isinstance(element, (bool, np.bool_)) - def should_store(self, value): - return issubclass(value.dtype.type, np.bool_) and not is_extension_array_dtype( - value - ) - - def replace( - self, to_replace, value, inplace=False, filter=None, regex=False, convert=True - ): + def replace(self, to_replace, value, inplace=False, regex=False, convert=True): inplace = validate_bool_kwarg(inplace, "inplace") to_replace_values = np.atleast_1d(to_replace) if not np.can_cast(to_replace_values, bool): return self return super().replace( - to_replace, - value, - inplace=inplace, - filter=filter, - regex=regex, - convert=convert, + to_replace, value, inplace=inplace, regex=regex, convert=convert, ) @@ -2626,7 +2435,6 @@ def f(mask, val, idx): # TODO: allow EA once reshape is supported values = values.reshape(shape) - values = _block_shape(values, ndim=self.ndim) return values if self.ndim == 2: @@ -2648,18 +2456,7 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] def _can_hold_element(self, element: Any) -> bool: return True - def should_store(self, value): - return not ( - issubclass( - value.dtype.type, - (np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_), - ) - or is_extension_array_dtype(value) - ) - - def replace( - self, to_replace, value, inplace=False, filter=None, regex=False, convert=True - ): + def replace(self, to_replace, value, inplace=False, regex=False, convert=True): to_rep_is_list = is_list_like(to_replace) value_is_list = is_list_like(value) both_lists = to_rep_is_list and value_is_list @@ -2670,33 +2467,18 @@ def replace( if not either_list and is_re(to_replace): return self._replace_single( - to_replace, - value, - inplace=inplace, - filter=filter, - regex=True, - convert=convert, + to_replace, value, inplace=inplace, regex=True, convert=convert, ) elif not (either_list or regex): return super().replace( - to_replace, - value, - inplace=inplace, - filter=filter, - regex=regex, - convert=convert, + to_replace, value, inplace=inplace, regex=regex, convert=convert, ) elif both_lists: for to_rep, v in zip(to_replace, value): result_blocks = [] for b in blocks: result = b._replace_single( - to_rep, - v, - inplace=inplace, - filter=filter, - regex=regex, - convert=convert, + to_rep, v, inplace=inplace, regex=regex, convert=convert, ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks @@ -2707,35 +2489,18 @@ def replace( result_blocks = [] for b in blocks: result = b._replace_single( - to_rep, - value, - inplace=inplace, - filter=filter, - regex=regex, - convert=convert, + to_rep, value, inplace=inplace, regex=regex, convert=convert, ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks return self._replace_single( - to_replace, - value, - inplace=inplace, - filter=filter, - 
convert=convert, - regex=regex, + to_replace, value, inplace=inplace, convert=convert, regex=regex, ) def _replace_single( - self, - to_replace, - value, - inplace=False, - filter=None, - regex=False, - convert=True, - mask=None, + self, to_replace, value, inplace=False, regex=False, convert=True, mask=None, ): """ Replace elements by the given value. @@ -2748,7 +2513,6 @@ def _replace_single( Replacement object. inplace : bool, default False Perform inplace modification. - filter : list, optional regex : bool, default False If true, perform regular expression substitution. convert : bool, default True @@ -2794,9 +2558,7 @@ def _replace_single( else: # if the thing to replace is not a string or compiled regex call # the superclass method -> to_replace is some kind of object - return super().replace( - to_replace, value, inplace=inplace, filter=filter, regex=regex - ) + return super().replace(to_replace, value, inplace=inplace, regex=regex) new_values = self.values if inplace else self.values.copy() @@ -2821,15 +2583,10 @@ def re_replacer(s): f = np.vectorize(re_replacer, otypes=[self.dtype]) - if filter is None: - filt = slice(None) - else: - filt = self.mgr_locs.isin(filter).nonzero()[0] - if mask is None: - new_values[filt] = f(new_values[filt]) + new_values[:] = f(new_values) else: - new_values[filt][mask] = f(new_values[filt][mask]) + new_values[mask] = f(new_values[mask]) # convert block = self.make_block(new_values) @@ -2887,6 +2644,8 @@ class CategoricalBlock(ExtensionBlock): _can_hold_na = True _concatenator = staticmethod(concat_categorical) + should_store = Block.should_store + def __init__(self, values, placement, ndim=None): # coerce to categorical if we can values = extract_array(values) @@ -2897,34 +2656,7 @@ def __init__(self, values, placement, ndim=None): def _holder(self): return Categorical - @property - def array_dtype(self): - """ - the dtype to return if I want to construct this block as an - array - """ - return np.object_ - - def to_dense(self): - # Categorical.get_values returns a DatetimeIndex for datetime - # categories, so we can't simply use `np.asarray(self.values)` like - # other types. - return self.values._internal_get_values() - - def to_native_types(self, slicer=None, na_rep="", quoting=None, **kwargs): - """ convert to our native types format, slicing if desired """ - values = self.values - if slicer is not None: - # Categorical is always one dimension - values = values[slicer] - mask = isna(values) - values = np.array(values, dtype="object") - values[mask] = na_rep - - # we are expected to return a 2-d ndarray - return values.reshape(1, len(values)) - - def concat_same_type(self, to_concat, placement=None): + def concat_same_type(self, to_concat): """ Concatenate list of single blocks of the same type. 
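Across the hunks above, the per-dtype `should_store` overrides (float, integer, bool, datetime, timedelta, object) collapse into the base `Block.should_store`, which reduces to a dtype-equality test between the incoming array and the block (CategoricalBlock now reuses it as well, while ExtensionBlock keeps its `isinstance(value, self._holder)` check). A small illustration of that test using the public `is_dtype_equal` helper; the sample dtypes are arbitrary.

```python
import numpy as np
from pandas.api.types import is_dtype_equal

block_dtype = np.dtype("float64")

# Can the value be stored in the block without casting?
print(is_dtype_equal(np.array([1.0]).dtype, block_dtype))   # True  -> store inplace
print(is_dtype_equal(np.array([1]).dtype, block_dtype))     # False -> needs a cast
```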
@@ -2940,34 +2672,23 @@ def concat_same_type(self, to_concat, placement=None): values = self._concatenator( [blk.values for blk in to_concat], axis=self.ndim - 1 ) + placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) # not using self.make_block_same_class as values can be object dtype - return make_block( - values, placement=placement or slice(0, len(values), 1), ndim=self.ndim - ) + return self.make_block(values, placement=placement) def replace( self, to_replace, value, inplace: bool = False, - filter=None, regex: bool = False, convert: bool = True, ): inplace = validate_bool_kwarg(inplace, "inplace") result = self if inplace else self.copy() - if filter is None: # replace was called on a series - result.values.replace(to_replace, value, inplace=True) - if convert: - return result.convert(numeric=False, copy=not inplace) - else: - return result - else: # replace was called on a DataFrame - if not isna(value): - result.values.add_categories(value, inplace=True) - return super(CategoricalBlock, result).replace( - to_replace, value, inplace, filter, regex, convert - ) + + result.values.replace(to_replace, value, inplace=True) + return result # ----------------------------------------------------------------- @@ -2993,7 +2714,7 @@ def get_block_type(values, dtype=None): if is_sparse(dtype): # Need this first(ish) so that Sparse[datetime] is sparse cls = ExtensionBlock - elif is_categorical(values): + elif is_categorical_dtype(values.dtype): cls = CategoricalBlock elif issubclass(vtype, np.datetime64): assert not is_datetime64tz_dtype(values) @@ -3062,45 +2783,18 @@ def _extend_blocks(result, blocks=None): return blocks -def _block_shape(values, ndim=1, shape=None): +def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: """ guarantee the shape of the values to be at least 1 d """ if values.ndim < ndim: - if shape is None: - shape = values.shape - if not is_extension_array_dtype(values): - # TODO: https://github.com/pandas-dev/pandas/issues/23023 + shape = values.shape + if not is_extension_array_dtype(values.dtype): + # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. - values = values.reshape(tuple((1,) + shape)) + values = values.reshape(tuple((1,) + shape)) # type: ignore return values -def _merge_blocks(blocks, dtype=None, _can_consolidate=True): - - if len(blocks) == 1: - return blocks[0] - - if _can_consolidate: - - if dtype is None: - if len({b.dtype for b in blocks}) != 1: - raise AssertionError("_merge_blocks are invalid!") - - # FIXME: optimization potential in case all mgrs contain slices and - # combination of those slices is a slice, too. - new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) - new_values = np.vstack([b.values for b in blocks]) - - argsort = np.argsort(new_mgr_locs) - new_values = new_values[argsort] - new_mgr_locs = new_mgr_locs[argsort] - - return make_block(new_values, placement=new_mgr_locs) - - # no merge - return blocks - - def _safe_reshape(arr, new_shape): """ If possible, reshape `arr` to have shape `new_shape`, @@ -3123,14 +2817,15 @@ def _safe_reshape(arr, new_shape): return arr -def _putmask_smart(v, mask, n): +def _putmask_smart(v: np.ndarray, mask: np.ndarray, n) -> np.ndarray: """ Return a new ndarray, try to preserve dtype if possible. 
Parameters ---------- - v : `values`, updated in-place (array like) - mask : np.ndarray + v : np.ndarray + `values`, updated in-place. + mask : np.ndarray[bool] Applies to both sides (array like). n : `new values` either scalar or an array like aligned with `values` @@ -3197,9 +2892,19 @@ def _putmask_preserve(nv, n): # change the dtype if needed dtype, _ = maybe_promote(n.dtype) - if is_extension_array_dtype(v.dtype) and is_object_dtype(dtype): - v = v._internal_get_values(dtype) - else: - v = v.astype(dtype) + v = v.astype(dtype) return _putmask_preserve(v, n) + + +def _extract_bool_array(mask: ArrayLike) -> np.ndarray: + """ + If we have a SparseArray or BooleanArray, convert it to ndarray[bool]. + """ + if isinstance(mask, ExtensionArray): + # We could have BooleanArray, Sparse[bool], ... + mask = np.asarray(mask, dtype=np.bool_) + + assert isinstance(mask, np.ndarray), type(mask) + assert mask.dtype == bool, mask.dtype + return mask diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 515e1bcd761b6..720e6799a3bf3 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -23,9 +23,57 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos +from pandas.core.internals.blocks import make_block +from pandas.core.internals.managers import BlockManager -def get_mgr_concatenation_plan(mgr, indexers): +def concatenate_block_managers( + mgrs_indexers, axes, concat_axis: int, copy: bool +) -> BlockManager: + """ + Concatenate block managers into one. + + Parameters + ---------- + mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + Returns + ------- + BlockManager + """ + concat_plans = [ + _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers + ] + concat_plan = _combine_concat_plans(concat_plans, concat_axis) + blocks = [] + + for placement, join_units in concat_plan: + + if len(join_units) == 1 and not join_units[0].indexers: + b = join_units[0].block + values = b.values + if copy: + values = values.copy() + else: + values = values.view() + b = b.make_block_same_class(values, placement=placement) + elif _is_uniform_join_units(join_units): + b = join_units[0].block.concat_same_type([ju.block for ju in join_units]) + b.mgr_locs = placement + else: + b = make_block( + _concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement, + ) + blocks.append(b) + + return BlockManager(blocks, axes) + + +def _get_mgr_concatenation_plan(mgr, indexers): """ Construct concatenation plan for given block manager and indexers. 
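For orientation, the concat.py changes above make `concatenate_block_managers` the module-level entry point driven by `pd.concat` internally: uniform join units take the `Block.concat_same_type` fast path, everything else falls back to `_concatenate_join_units`/`concat_compat`. A quick public-API sketch (the frames are invented for illustration) that exercises this machinery:

```python
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
df2 = pd.DataFrame({"a": [3, 4], "b": [3.5, 4.5]})

# Row-wise concat of identically-typed columns: each column's join units are
# uniform, so concat_same_type applies; mixed dtypes per column would instead
# route through the generic _concatenate_join_units path.
out = pd.concat([df1, df2], ignore_index=True)
print(out.dtypes)   # a      int64
                    # b    float64
```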
@@ -48,8 +96,8 @@ def get_mgr_concatenation_plan(mgr, indexers): if 0 in indexers: ax0_indexer = indexers.pop(0) - blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) - blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) + blknos = algos.take_1d(mgr.blknos, ax0_indexer, fill_value=-1) + blklocs = algos.take_1d(mgr.blklocs, ax0_indexer, fill_value=-1) else: if mgr._is_single_block: @@ -57,8 +105,8 @@ def get_mgr_concatenation_plan(mgr, indexers): return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] ax0_indexer = None - blknos = mgr._blknos - blklocs = mgr._blklocs + blknos = mgr.blknos + blklocs = mgr.blklocs plan = [] for blkno, placements in libinternals.get_blkno_placements(blknos, group=False): @@ -217,7 +265,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): else: # No dtype upcasting is done here, it will be performed during # concatenation itself. - values = self.block.get_values() + values = self.block.values if not self.indexers: # If there's no indexing to be done, we want to signal outside @@ -232,7 +280,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): return values -def concatenate_join_units(join_units, concat_axis, copy): +def _concatenate_join_units(join_units, concat_axis, copy): """ Concatenate values from several join units along selected axis. """ @@ -371,11 +419,11 @@ def _get_empty_dtype_and_na(join_units): raise AssertionError(msg) -def is_uniform_join_units(join_units) -> bool: +def _is_uniform_join_units(join_units) -> bool: """ Check if the join units consist of blocks of uniform type that can be concatenated using Block.concat_same_type instead of the generic - concatenate_join_units (which uses `concat_compat`). + _concatenate_join_units (which uses `concat_compat`). """ return ( @@ -429,7 +477,7 @@ def _trim_join_unit(join_unit, length): return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape) -def combine_concat_plans(plans, concat_axis): +def _combine_concat_plans(plans, concat_axis): """ Combine multiple concatenation plans into one. diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 57ed2555761be..5c9e4b96047ee 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -3,11 +3,13 @@ constructors before passing them to a BlockManager. 
""" from collections import abc +from typing import Dict, List, Optional, Tuple, Union import numpy as np import numpy.ma as ma from pandas._libs import lib +from pandas._typing import Axis, Dtype, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -29,14 +31,13 @@ ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, - ABCPeriodIndex, ABCSeries, ABCTimedeltaIndex, ) from pandas.core import algorithms, common as com from pandas.core.arrays import Categorical -from pandas.core.construction import sanitize_array +from pandas.core.construction import extract_array, sanitize_array from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( Index, @@ -44,7 +45,7 @@ get_objs_combined_axis, union_indexes, ) -from pandas.core.internals import ( +from pandas.core.internals.managers import ( create_block_manager_from_arrays, create_block_manager_from_blocks, ) @@ -53,23 +54,33 @@ # BlockManager Interface -def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): +def arrays_to_mgr( + arrays, arr_names, index, columns, dtype=None, verify_integrity: bool = True +): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ - # figure out the index, if necessary - if index is None: - index = extract_index(arrays) + arr_names = ensure_index(arr_names) + + if verify_integrity: + # figure out the index, if necessary + if index is None: + index = extract_index(arrays) + else: + index = ensure_index(index) + + # don't force copy because getting jammed in an ndarray anyway + arrays = _homogenize(arrays, index, dtype) + + columns = ensure_index(columns) else: + columns = ensure_index(columns) index = ensure_index(index) - # don't force copy because getting jammed in an ndarray anyway - arrays = _homogenize(arrays, index, dtype) - # from BlockManager perspective - axes = [ensure_index(columns), index] + axes = [columns, index] return create_block_manager_from_arrays(arrays, arr_names, axes) @@ -160,7 +171,8 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): values = [values] if columns is None: - columns = list(range(len(values))) + columns = Index(range(len(values))) + return arrays_to_mgr(values, columns, index, columns, dtype=dtype) # by definition an array here @@ -413,7 +425,7 @@ def get_names_from_index(data): return index -def _get_axes(N, K, index, columns): +def _get_axes(N, K, index, columns) -> Tuple[Index, Index]: # helper to create the axes as indexes # return axes or defaults @@ -429,6 +441,33 @@ def _get_axes(N, K, index, columns): return index, columns +def dataclasses_to_dicts(data): + """ Converts a list of dataclass instances to a list of dictionaries + + Parameters + ---------- + data : List[Type[dataclass]] + + Returns + -------- + list_dict : List[dict] + + Examples + -------- + >>> @dataclass + >>> class Point: + ... x: int + ... 
y: int + + >>> dataclasses_to_dicts([Point(1,2), Point(2,3)]) + [{"x":1,"y":2},{"x":2,"y":3}] + + """ + from dataclasses import asdict + + return list(map(asdict, data)) + + # --------------------------------------------------------------------- # Conversion of Inputs to Arrays @@ -484,7 +523,12 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) -def _list_to_arrays(data, columns, coerce_float=False, dtype=None): +def _list_to_arrays( + data: List[Scalar], + columns: Union[Index, List], + coerce_float: bool = False, + dtype: Optional[Dtype] = None, +) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) else: @@ -492,21 +536,25 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): content = list(lib.to_object_array(data).T) # gh-26429 do not raise user-facing AssertionError try: - result = _convert_object_array( - content, columns, dtype=dtype, coerce_float=coerce_float - ) + columns = _validate_or_indexify_columns(content, columns) + result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) except AssertionError as e: raise ValueError(e) from e - return result + return result, columns -def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): +def _list_of_series_to_arrays( + data: List, + columns: Union[Index, List], + coerce_float: bool = False, + dtype: Optional[Dtype] = None, +) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] columns = get_objs_combined_axis(pass_data, sort=False) - indexer_cache = {} + indexer_cache: Dict[int, Scalar] = {} aligned_values = [] for s in data: @@ -519,21 +567,26 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): else: indexer = indexer_cache[id(index)] = index.get_indexer(columns) - values = com.values_from_object(s) + values = extract_array(s, extract_numpy=True) aligned_values.append(algorithms.take_1d(values, indexer)) values = np.vstack(aligned_values) if values.dtype == np.object_: content = list(values.T) - return _convert_object_array( - content, columns, dtype=dtype, coerce_float=coerce_float - ) + columns = _validate_or_indexify_columns(content, columns) + content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) + return content, columns else: return values.T, columns -def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): +def _list_of_dict_to_arrays( + data: List, + columns: Union[Index, List], + coerce_float: bool = False, + dtype: Optional[Dtype] = None, +) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: """ Convert list of dicts to numpy arrays @@ -565,22 +618,85 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): data = [(type(d) is dict) and d or dict(d) for d in data] content = list(lib.dicts_to_array(data, list(columns)).T) - return _convert_object_array( - content, columns, dtype=dtype, coerce_float=coerce_float - ) + columns = _validate_or_indexify_columns(content, columns) + content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) + return content, columns + +def _validate_or_indexify_columns( + content: List, columns: Union[Index, List, None] +) -> Union[Index, List[Axis]]: + """ + If columns is None, make 
numbers as column names; Otherwise, validate that + columns have valid length. + + Parameters + ---------- + content: list of data + columns: Iterable or None -def _convert_object_array(content, columns, coerce_float=False, dtype=None): + Returns + ------- + columns: If columns is Iterable, return as is; If columns is None, assign + positional column index value as columns. + + Raises + ------ + 1. AssertionError when content is not composed of list of lists, and if + length of columns is not equal to length of content. + 2. ValueError when content is list of lists, but length of each sub-list + is not equal + 3. ValueError when content is list of lists, but length of sub-list is + not equal to length of content + """ if columns is None: columns = ibase.default_index(len(content)) else: - if len(columns) != len(content): # pragma: no cover + + # Add mask for data which is composed of list of lists + is_mi_list = isinstance(columns, list) and all( + isinstance(col, list) for col in columns + ) + + if not is_mi_list and len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... raise AssertionError( f"{len(columns)} columns passed, passed data had " f"{len(content)} columns" ) + elif is_mi_list: + + # check if nested list column, length of each sub-list should be equal + if len({len(col) for col in columns}) > 1: + raise ValueError( + "Length of columns passed for MultiIndex columns is different" + ) + # if columns is not empty and length of sublist is not equal to content + elif columns and len(columns[0]) != len(content): + raise ValueError( + f"{len(columns[0])} columns passed, passed data had " + f"{len(content)} columns" + ) + return columns + + +def _convert_object_array( + content: List[Scalar], coerce_float: bool = False, dtype: Optional[Dtype] = None +) -> List[Scalar]: + """ + Internal function ot convert object array. + + Parameters + ---------- + content: list of processed data records + coerce_float: bool, to coerce floats or not, default is False + dtype: np.dtype, default is None + + Returns + ------- + arrays: casted content if not object dtype, otherwise return as is in list. 
+ """ # provide soft conversion of object dtypes def convert(arr): if dtype != object and dtype != np.object: @@ -590,7 +706,7 @@ def convert(arr): arrays = [convert(arr) for arr in content] - return arrays, columns + return arrays # --------------------------------------------------------------------- @@ -605,12 +721,7 @@ def sanitize_index(data, index: Index): if len(data) != len(index): raise ValueError("Length of values does not match length of index") - if isinstance(data, ABCIndexClass): - pass - elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)): - data = data._values - - elif isinstance(data, np.ndarray): + if isinstance(data, np.ndarray): # coerce datetimelike types if data.dtype.kind in ["M", "m"]: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 329bfdf543c62..e693341d10a55 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1,40 +1,41 @@ from collections import defaultdict -from functools import partial import itertools import operator import re -from typing import Dict, List, Optional, Sequence, Tuple, Union +from typing import DefaultDict, Dict, List, Optional, Sequence, Tuple, TypeVar, Union +import warnings import numpy as np -from pandas._libs import Timedelta, Timestamp, internals as libinternals, lib -from pandas._typing import DtypeObj +from pandas._libs import internals as libinternals, lib +from pandas._typing import ArrayLike, DtypeObj, Label, Scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( find_common_type, infer_dtype_from_scalar, - maybe_convert_objects, maybe_promote, ) from pandas.core.dtypes.common import ( - _NS_DTYPE, + DT64NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype, is_list_like, is_numeric_v_string_like, is_scalar, - is_sparse, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos +from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject +import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices -from pandas.core.indexes.api import Index, MultiIndex, ensure_index +from pandas.core.indexes.api import Index, ensure_index from pandas.core.internals.blocks import ( Block, CategoricalBlock, @@ -42,22 +43,15 @@ ExtensionBlock, ObjectValuesExtensionBlock, _extend_blocks, - _merge_blocks, _safe_reshape, get_block_type, make_block, ) -from pandas.core.internals.concat import ( # all for concatenate_block_managers - combine_concat_plans, - concatenate_join_units, - get_mgr_concatenation_plan, - is_uniform_join_units, -) - -from pandas.io.formats.printing import pprint_thing # TODO: flexible with index=None and/or items=None +T = TypeVar("T", bound="BlockManager") + class BlockManager(PandasObject): """ @@ -114,14 +108,15 @@ class BlockManager(PandasObject): __slots__ = [ "axes", "blocks", - "_ndim", - "_shape", "_known_consolidated", "_is_consolidated", "_blknos", "_blklocs", ] + _blknos: np.ndarray + _blklocs: np.ndarray + def __init__( self, blocks: Sequence[Block], @@ -141,38 +136,77 @@ def __init__( if do_integrity_check: self._verify_integrity() + # Populate known_consolidate, blknos, and blklocs lazily self._known_consolidated 
= False + self._blknos = None + self._blklocs = None - self._rebuild_blknos_and_blklocs() + @classmethod + def from_blocks(cls, blocks: List[Block], axes: List[Index]): + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + return cls(blocks, axes, do_integrity_check=False) + + @property + def blknos(self): + """ + Suppose we want to find the array corresponding to our i'th column. + + blknos[i] identifies the block from self.blocks that contains this column. - def make_empty(self, axes=None): + blklocs[i] identifies the column of interest within + self.blocks[self.blknos[i]] + """ + if self._blknos is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blknos + + @property + def blklocs(self): + """ + See blknos.__doc__ + """ + if self._blklocs is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blklocs + + def make_empty(self: T, axes=None) -> T: """ return an empty BlockManager with the items axis of len 0 """ if axes is None: - axes = [ensure_index([])] + [ensure_index(a) for a in self.axes[1:]] + axes = [Index([])] + self.axes[1:] # preserve dtype if possible if self.ndim == 1: - blocks = np.array([], dtype=self.array_dtype) + assert isinstance(self, SingleBlockManager) # for mypy + blk = self.blocks[0] + arr = blk.values[:0] + nb = blk.make_block_same_class(arr, placement=slice(0, 0), ndim=1) + blocks = [nb] else: blocks = [] - return type(self)(blocks, axes) + return type(self).from_blocks(blocks, axes) - def __nonzero__(self): + def __nonzero__(self) -> bool: return True # Python3 compat __bool__ = __nonzero__ @property - def shape(self): + def shape(self) -> Tuple[int, ...]: return tuple(len(ax) for ax in self.axes) @property def ndim(self) -> int: return len(self.axes) - def set_axis(self, axis, new_labels): - new_labels = ensure_index(new_labels) + def set_axis(self, axis: int, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. old_len = len(self.axes[axis]) new_len = len(new_labels) @@ -184,21 +218,6 @@ def set_axis(self, axis, new_labels): self.axes[axis] = new_labels - def rename_axis(self, mapper, axis, copy: bool = True, level=None): - """ - Rename one of axes. - - Parameters - ---------- - mapper : unary callable - axis : int - copy : bool, default True - level : int, default None - """ - obj = self.copy(deep=copy) - obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level)) - return obj - @property def _is_single_block(self) -> bool: if self.ndim == 1: @@ -212,7 +231,7 @@ def _is_single_block(self) -> bool: 0, len(self), 1 ) - def _rebuild_blknos_and_blklocs(self): + def _rebuild_blknos_and_blklocs(self) -> None: """ Update mgr._blknos / mgr._blklocs. """ @@ -227,13 +246,14 @@ def _rebuild_blknos_and_blklocs(self): new_blklocs[rl.indexer] = np.arange(len(rl)) if (new_blknos == -1).any(): + # TODO: can we avoid this? 
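What the new `blknos`/`blklocs` docstring above describes, as a tiny numpy illustration (the block layout here is made up for the example):

```python
import numpy as np

# block 0 holds manager columns [0, 2]; block 1 holds manager column [1]
blocks = {0: np.array([[1.0, 2.0], [5.0, 6.0]]), 1: np.array([["x", "y"]])}
blknos = np.array([0, 1, 0])    # which block holds column i
blklocs = np.array([0, 0, 1])   # where inside that block column i lives

def column(i):
    return blocks[blknos[i]][blklocs[i]]

assert column(2).tolist() == [5.0, 6.0]   # second row of block 0
assert column(1).tolist() == ["x", "y"]
```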
it isn't cheap raise AssertionError("Gaps in blk ref_locs") self._blknos = new_blknos self._blklocs = new_blklocs @property - def items(self): + def items(self) -> Index: return self.axes[0] def _get_counts(self, f): @@ -250,7 +270,7 @@ def get_dtype_counts(self): def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) - return algos.take_1d(dtypes, self._blknos, allow_fill=False) + return algos.take_1d(dtypes, self.blknos, allow_fill=False) def __getstate__(self): block_values = [b.values for b in self.blocks] @@ -286,7 +306,7 @@ def unpickle_block(values, mgr_locs): self._post_setstate() - def _post_setstate(self): + def _post_setstate(self) -> None: self._is_consolidated = False self._known_consolidated = False self._rebuild_blknos_and_blklocs() @@ -303,15 +323,15 @@ def __repr__(self) -> str: output += f"\nAxis {i}: {ax}" for block in self.blocks: - output += f"\n{pprint_thing(block)}" + output += f"\n{block}" return output - def _verify_integrity(self): + def _verify_integrity(self) -> None: mgr_shape = self.shape tot_items = sum(len(x.mgr_locs) for x in self.blocks) for block in self.blocks: if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: - construction_error(tot_items, block.shape[1:], self.axes) + raise construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: raise AssertionError( "Number of manager items must equal union of " @@ -345,7 +365,7 @@ def reduce(self, func, *args, **kwargs): return res - def apply(self, f, filter=None, **kwargs): + def apply(self: T, f, align_keys=None, **kwargs) -> T: """ Iterate over the blocks, collect and create a new BlockManager. @@ -353,67 +373,37 @@ def apply(self, f, filter=None, **kwargs): ---------- f : str or callable Name of the Block method to apply. - filter : list, if supplied, only call the block if the filter is in - the block Returns ------- BlockManager """ - result_blocks = [] + assert "filter" not in kwargs - # filter kwarg is used in replace-* family of methods - if filter is not None: - filter_locs = set(self.items.get_indexer_for(filter)) - if len(filter_locs) == len(self.items): - # All items are included, as if there were no filtering - filter = None - else: - kwargs["filter"] = filter_locs + align_keys = align_keys or [] + result_blocks: List[Block] = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned self._consolidate_inplace() + align_copy = False if f == "where": align_copy = True - if kwargs.get("align", True): - align_keys = ["other", "cond"] - else: - align_keys = ["cond"] - elif f == "putmask": - align_copy = False - if kwargs.get("align", True): - align_keys = ["new", "mask"] - else: - align_keys = ["mask"] - elif f == "fillna": - # fillna internally does putmask, maybe it's better to do this - # at mgr, not block level? - align_copy = False - align_keys = ["value"] - else: - align_keys = [] - # TODO(EA): may interfere with ExtensionBlock.setitem for blocks - # with a .values attribute. 
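The reworked `BlockManager.apply` above reindexes Series/DataFrame keyword arguments to each block's own items before calling the block method. A small sketch of that alignment step, using the same internal attributes (`_info_axis_number`, `_values`) that the diff itself relies on:

```python
import pandas as pd

def align_kwarg_to_block(obj, block_items: pd.Index):
    # 0 for a Series, 1 for a DataFrame, as in BlockManager.apply above
    axis = obj._info_axis_number
    return obj.reindex(block_items, axis=axis)._values

other = pd.Series({"a": 1, "b": 2, "c": 3})
block_items = pd.Index(["c", "a"])        # columns held by one block
assert align_kwarg_to_block(other, block_items).tolist() == [3, 1]
```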
aligned_args = { k: kwargs[k] for k in align_keys - if not isinstance(kwargs[k], ABCExtensionArray) - and hasattr(kwargs[k], "values") + if isinstance(kwargs[k], (ABCSeries, ABCDataFrame)) } for b in self.blocks: - if filter is not None: - if not b.mgr_locs.isin(filter_locs).any(): - result_blocks.append(b) - continue if aligned_args: b_items = self.items[b.mgr_locs.indexer] for k, obj in aligned_args.items(): axis = obj._info_axis_number - kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) + kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)._values if callable(f): applied = b.apply(f, **kwargs) @@ -423,18 +413,18 @@ def apply(self, f, filter=None, **kwargs): if len(result_blocks) == 0: return self.make_empty(self.axes) - bm = type(self)(result_blocks, self.axes, do_integrity_check=False) - return bm + + return type(self).from_blocks(result_blocks, self.axes) def quantile( self, - axis=0, - consolidate=True, - transposed=False, + axis: int = 0, + consolidate: bool = True, + transposed: bool = False, interpolation="linear", qs=None, numeric_only=None, - ): + ) -> "BlockManager": """ Iterate over blocks applying quantile reduction. This routine is intended for reduction type operations and @@ -453,7 +443,7 @@ def quantile( Returns ------- - Block Manager (new object) + BlockManager """ # Series dispatches to DataFrame for quantile, which allows us to # simplify some of the code here and in the blocks @@ -526,49 +516,101 @@ def get_axe(block, qs, axes): values = values.take(indexer) return SingleBlockManager( - make_block(values, ndim=1, placement=np.arange(len(values))), - axes[0], - fastpath=True, + make_block(values, ndim=1, placement=np.arange(len(values))), axes[0], ) - def isna(self, func): + def isna(self, func) -> "BlockManager": return self.apply("apply", func=func) - def where(self, **kwargs): - return self.apply("where", **kwargs) + def where( + self, other, cond, align: bool, errors: str, try_cast: bool, axis: int + ) -> "BlockManager": + if align: + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] + other = extract_array(other, extract_numpy=True) + + return self.apply( + "where", + align_keys=align_keys, + other=other, + cond=cond, + errors=errors, + try_cast=try_cast, + axis=axis, + ) + + def setitem(self, indexer, value) -> "BlockManager": + return self.apply("setitem", indexer=indexer, value=value) - def setitem(self, **kwargs): - return self.apply("setitem", **kwargs) + def putmask( + self, mask, new, align: bool = True, axis: int = 0, + ): + transpose = self.ndim == 2 - def putmask(self, **kwargs): - return self.apply("putmask", **kwargs) + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + return self.apply( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, + inplace=True, + axis=axis, + transpose=transpose, + ) - def diff(self, **kwargs): - return self.apply("diff", **kwargs) + def diff(self, n: int, axis: int) -> "BlockManager": + return self.apply("diff", n=n, axis=axis) - def interpolate(self, **kwargs): + def interpolate(self, **kwargs) -> "BlockManager": return self.apply("interpolate", **kwargs) - def shift(self, **kwargs): - return self.apply("shift", **kwargs) + def shift(self, periods: int, axis: int, fill_value) -> "BlockManager": + return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) - def fillna(self, **kwargs): - return self.apply("fillna", **kwargs) + def fillna(self, value, limit, inplace: bool, downcast) -> 
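The new `where`/`putmask` signatures above call `extract_array(..., extract_numpy=True)` so that a Series passed as `other`/`new` is unwrapped to its underlying array before alignment. For reference:

```python
import numpy as np
import pandas as pd
from pandas.core.construction import extract_array

s = pd.Series([1, 2, 3])
arr = extract_array(s, extract_numpy=True)   # Series -> plain ndarray
assert isinstance(arr, np.ndarray)
assert arr.tolist() == [1, 2, 3]
```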
"BlockManager": + return self.apply( + "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast + ) - def downcast(self, **kwargs): - return self.apply("downcast", **kwargs) + def downcast(self) -> "BlockManager": + return self.apply("downcast") - def astype(self, dtype, copy: bool = False, errors: str = "raise"): + def astype( + self, dtype, copy: bool = False, errors: str = "raise" + ) -> "BlockManager": return self.apply("astype", dtype=dtype, copy=copy, errors=errors) - def convert(self, **kwargs): - return self.apply("convert", **kwargs) + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + coerce: bool = False, + ) -> "BlockManager": + return self.apply( + "convert", + copy=copy, + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + ) - def replace(self, value, **kwargs): + def replace(self, value, **kwargs) -> "BlockManager": assert np.ndim(value) == 0, value return self.apply("replace", value=value, **kwargs) - def replace_list(self, src_list, dest_list, inplace=False, regex=False): + def replace_list( + self, src_list, dest_list, inplace: bool = False, regex: bool = False + ) -> "BlockManager": """ do a list replace """ inplace = validate_bool_kwarg(inplace, "inplace") @@ -582,11 +624,8 @@ def comp(s, regex=False): """ if isna(s): return isna(values) - if isinstance(s, (Timedelta, Timestamp)) and getattr(s, "tz", None) is None: - return _compare_or_regex_search( - maybe_convert_objects(values), s.asm8, regex - ) + s = com.maybe_box_datetimelike(s) return _compare_or_regex_search(values, s, regex) masks = [comp(s, regex) for s in src_list] @@ -599,11 +638,10 @@ def comp(s, regex=False): # replace ALWAYS will return a list rb = [blk if inplace else blk.copy()] for i, (s, d) in enumerate(zip(src_list, dest_list)): - # TODO: assert/validate that `d` is always a scalar? 
- new_rb = [] + new_rb: List[Block] = [] for b in rb: m = masks[i][b.mgr_locs.indexer] - convert = i == src_len + convert = i == src_len # only convert once at the end result = b._replace_coerce( mask=m, to_replace=s, @@ -619,11 +657,11 @@ def comp(s, regex=False): rb = new_rb result_blocks.extend(rb) - bm = type(self)(result_blocks, self.axes) + bm = type(self).from_blocks(result_blocks, self.axes) bm._consolidate_inplace() return bm - def is_consolidated(self): + def is_consolidated(self) -> bool: """ Return True if more than one block with the same dtype """ @@ -631,9 +669,9 @@ def is_consolidated(self): self._consolidate_check() return self._is_consolidated - def _consolidate_check(self): - ftypes = [blk.ftype for blk in self.blocks] - self._is_consolidated = len(ftypes) == len(set(ftypes)) + def _consolidate_check(self) -> None: + dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] + self._is_consolidated = len(dtypes) == len(set(dtypes)) self._known_consolidated = True @property @@ -668,7 +706,7 @@ def is_view(self) -> bool: return False - def get_bool_data(self, copy: bool = False): + def get_bool_data(self, copy: bool = False) -> "BlockManager": """ Parameters ---------- @@ -676,9 +714,9 @@ def get_bool_data(self, copy: bool = False): Whether to copy the blocks """ self._consolidate_inplace() - return self.combine([b for b in self.blocks if b.is_bool], copy) + return self._combine([b for b in self.blocks if b.is_bool], copy) - def get_numeric_data(self, copy: bool = False): + def get_numeric_data(self, copy: bool = False) -> "BlockManager": """ Parameters ---------- @@ -686,9 +724,9 @@ def get_numeric_data(self, copy: bool = False): Whether to copy the blocks """ self._consolidate_inplace() - return self.combine([b for b in self.blocks if b.is_numeric], copy) + return self._combine([b for b in self.blocks if b.is_numeric], copy) - def combine(self, blocks, copy=True): + def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": """ return a new manager with the blocks """ if len(blocks) == 0: return self.make_empty() @@ -700,27 +738,23 @@ def combine(self, blocks, copy=True): new_blocks = [] for b in blocks: b = b.copy(deep=copy) - b.mgr_locs = algos.take_1d( - inv_indexer, b.mgr_locs.as_array, axis=0, allow_fill=False - ) + b.mgr_locs = inv_indexer[b.mgr_locs.indexer] new_blocks.append(b) axes = list(self.axes) axes[0] = self.items.take(indexer) - return type(self)(new_blocks, axes, do_integrity_check=False) + return type(self).from_blocks(new_blocks, axes) - def get_slice(self, slobj: slice, axis: int = 0): - if axis >= self.ndim: - raise IndexError("Requested axis not found in manager") + def get_slice(self, slobj: slice, axis: int = 0) -> "BlockManager": if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) - else: - _slicer = [slice(None)] * (axis + 1) - _slicer[axis] = slobj - slicer = tuple(_slicer) + elif axis == 1: + slicer = (slice(None), slobj) new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] + else: + raise IndexError("Requested axis not found in manager") new_axes = list(self.axes) new_axes[axis] = new_axes[axis][slobj] @@ -728,14 +762,11 @@ def get_slice(self, slobj: slice, axis: int = 0): bm = type(self)(new_blocks, new_axes, do_integrity_check=False) return bm - def __contains__(self, item) -> bool: - return item in self.items - @property def nblocks(self) -> int: return len(self.blocks) - def copy(self, deep=True): + def copy(self: T, deep=True) -> T: """ Make deep or shallow copy of BlockManager @@ -781,21 
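The `_consolidate_check` change above bases the check on the dtypes of consolidatable blocks rather than the removed `ftype`. A self-contained sketch of that rule (the block metadata here is invented for illustration):

```python
from typing import List, Tuple

def is_consolidated(blocks: List[Tuple[str, bool]]) -> bool:
    # blocks: (dtype name, can_consolidate) pairs; a manager is consolidated
    # when no two consolidatable blocks share a dtype
    dtypes = [dtype for dtype, can_consolidate in blocks if can_consolidate]
    return len(dtypes) == len(set(dtypes))

assert is_consolidated([("float64", True), ("int64", True)])
assert not is_consolidated([("float64", True), ("float64", True)])
assert is_consolidated([("category", False), ("category", False)])
```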
+812,19 @@ def as_array(self, transpose: bool = False) -> np.ndarray: arr = np.empty(self.shape, dtype=float) return arr.transpose() if transpose else arr - mgr = self - - if self._is_single_block and mgr.blocks[0].is_datetimetz: + if self._is_single_block and self.blocks[0].is_datetimetz: # TODO(Block.get_values): Make DatetimeTZBlock.get_values # always be object dtype. Some callers seem to want the # DatetimeArray (previously DTI) - arr = mgr.blocks[0].get_values(dtype=object) + arr = self.blocks[0].get_values(dtype=object) elif self._is_single_block or not self.is_mixed_type: - arr = np.asarray(mgr.blocks[0].get_values()) + arr = np.asarray(self.blocks[0].get_values()) else: - arr = mgr._interleave() + arr = self._interleave() return arr.transpose() if transpose else arr - def _interleave(self): + def _interleave(self) -> np.ndarray: """ Return ndarray from blocks with specified item order Items must be contained in the blocks @@ -804,7 +833,7 @@ def _interleave(self): # TODO: https://github.com/pandas-dev/pandas/issues/22791 # Give EAs some input on what happens here. Sparse needs this. - if is_sparse(dtype): + if isinstance(dtype, SparseDtype): dtype = dtype.subtype elif is_extension_array_dtype(dtype): dtype = "object" @@ -845,32 +874,27 @@ def to_dict(self, copy: bool = True): for b in self.blocks: bd.setdefault(str(b.dtype), []).append(b) - return {dtype: self.combine(blocks, copy=copy) for dtype, blocks in bd.items()} + # TODO(EA2D): the combine will be unnecessary with 2D EAs + return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} - def fast_xs(self, loc: int): + def fast_xs(self, loc: int) -> ArrayLike: """ - get a cross sectional for a given location in the - items ; handle dups + Return the array corresponding to `frame.iloc[loc]`. + + Parameters + ---------- + loc : int - return the result, is *could* be a view in the case of a - single block + Returns + ------- + np.ndarray or ExtensionArray """ if len(self.blocks) == 1: return self.blocks[0].iget((slice(None), loc)) - items = self.items - - # non-unique (GH4726) - if not items.is_unique: - result = self._interleave() - if self.ndim == 2: - result = result.T - return result[loc] - - # unique dtype = _interleaved_dtype(self.blocks) - n = len(items) + n = len(self) if is_extension_array_dtype(dtype): # we'll eventually construct an ExtensionArray. result = np.empty(n, dtype=object) @@ -904,7 +928,7 @@ def consolidate(self) -> "BlockManager": bm._consolidate_inplace() return bm - def _consolidate_inplace(self): + def _consolidate_inplace(self) -> None: if not self.is_consolidated(): self.blocks = tuple(_consolidate(self.blocks)) self._is_consolidated = True @@ -944,8 +968,8 @@ def iget(self, i: int) -> "SingleBlockManager": """ Return the data as a SingleBlockManager. """ - block = self.blocks[self._blknos[i]] - values = block.iget(self._blklocs[i]) + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) # shortcut for select a single-dim from a 2-dim BM return SingleBlockManager( @@ -953,15 +977,20 @@ def iget(self, i: int) -> "SingleBlockManager": values, placement=slice(0, len(values)), ndim=1 ), self.axes[1], - fastpath=True, ) - def delete(self, item): + def iget_values(self, i: int) -> ArrayLike: """ - Delete selected item (items if non-unique) in-place. + Return the data for column i as the values (ndarray or ExtensionArray). 
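The `as_array`/`_interleave` path above is what backs `DataFrame.to_numpy()`; mixed block dtypes interleave into a single (usually object) array:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
arr = df.to_numpy()            # routed through BlockManager.as_array
assert arr.dtype == np.dtype(object)
assert arr.shape == (2, 2)
```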
""" - indexer = self.items.get_loc(item) + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + return values + def idelete(self, indexer): + """ + Delete selected locations in-place (new block and array, same BlockManager) + """ is_deleted = np.zeros(self.shape[0], dtype=np.bool_) is_deleted[indexer] = True ref_loc_offset = -is_deleted.cumsum() @@ -973,7 +1002,7 @@ def delete(self, item): else: affected_start = is_deleted.nonzero()[0][0] - for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): + for blkno, _ in _fast_count_smallints(self.blknos[affected_start:]): blk = self.blocks[blkno] bml = blk.mgr_locs blk_del = is_deleted[bml.indexer].nonzero()[0] @@ -992,16 +1021,35 @@ def delete(self, item): self.blocks = tuple( b for blkno, b in enumerate(self.blocks) if not is_blk_deleted[blkno] ) - self._shape = None self._rebuild_blknos_and_blklocs() - def set(self, item, value): + def set(self, item: Label, value): + """ + Set new item in-place. + + Notes + ----- + Does not consolidate. + Adds new Block if not contained in the current items Index. + """ + try: + loc = self.items.get_loc(item) + except KeyError: + # This item wasn't present, just insert at end + self.insert(len(self.items), item, value) + return + + self.iset(loc, value) + + def iset(self, loc: Union[int, slice, np.ndarray], value): """ Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items """ # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical + if self._blklocs is None and self.ndim > 1: + self._rebuild_blknos_and_blklocs() value_is_extension_type = is_extension_array_dtype(value) @@ -1028,18 +1076,15 @@ def value_getitem(placement): "Shape of new values must be compatible with manager shape" ) - try: - loc = self.items.get_loc(item) - except KeyError: - # This item wasn't present, just insert at end - self.insert(len(self.items), item, value) - return - - if isinstance(loc, int): + if lib.is_integer(loc): + # We have 6 tests where loc is _not_ an int. + # In this case, get_blkno_placements will yield only one tuple, + # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1))) loc = [loc] - blknos = self._blknos[loc] - blklocs = self._blklocs[loc].copy() + # Accessing public blknos ensures the public versions are initialized + blknos = self.blknos[loc] + blklocs = self.blklocs[loc].copy() unfit_mgr_locs = [] unfit_val_locs = [] @@ -1057,7 +1102,6 @@ def value_getitem(placement): if len(val_locs) == len(blk.mgr_locs): removed_blknos.append(blkno) else: - self._blklocs[blk.mgr_locs.indexer] = -1 blk.delete(blk_locs) self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) @@ -1069,9 +1113,7 @@ def value_getitem(placement): new_blknos = np.empty(self.nblocks, dtype=np.int64) new_blknos.fill(-1) new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos)) - self._blknos = algos.take_1d( - new_blknos, self._blknos, axis=0, allow_fill=False - ) + self._blknos = new_blknos[self._blknos] self.blocks = tuple( blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) ) @@ -1080,13 +1122,14 @@ def value_getitem(placement): unfit_mgr_locs = np.concatenate(unfit_mgr_locs) unfit_count = len(unfit_mgr_locs) - new_blocks = [] + new_blocks: List[Block] = [] if value_is_extension_type: - # This code (ab-)uses the fact that sparse blocks contain only + # This code (ab-)uses the fact that EA blocks contain only # one item. 
+ # TODO(EA2D): special casing unnecessary with 2D EAs new_blocks.extend( make_block( - values=value.copy(), + values=value, ndim=self.ndim, placement=slice(mgr_loc, mgr_loc + 1), ) @@ -1116,7 +1159,7 @@ def value_getitem(placement): # Newly created block's dtype may already be present. self._known_consolidated = False - def insert(self, loc: int, item, value, allow_duplicates: bool = False): + def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): """ Insert item at selected position. @@ -1139,9 +1182,12 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False): # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) + if value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): + value = _safe_reshape(value, (1,) + value.shape) + block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) - for blkno, count in _fast_count_smallints(self._blknos[loc:]): + for blkno, count in _fast_count_smallints(self.blknos[loc:]): blk = self.blocks[blkno] if count == len(blk.mgr_locs): blk.mgr_locs = blk.mgr_locs.add(1) @@ -1150,7 +1196,8 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False): new_mgr_locs[new_mgr_locs >= loc] += 1 blk.mgr_locs = new_mgr_locs - if loc == self._blklocs.shape[0]: + # Accessing public blklocs ensures the public versions are initialized + if loc == self.blklocs.shape[0]: # np.append is a lot faster, let's use it if we can. self._blklocs = np.append(self._blklocs, 0) self._blknos = np.append(self._blknos, len(self.blocks)) @@ -1160,7 +1207,6 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False): self.axes[0] = new_axis self.blocks += (block,) - self._shape = None self._known_consolidated = False @@ -1168,7 +1214,13 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False): self._consolidate_inplace() def reindex_axis( - self, new_index, axis, method=None, limit=None, fill_value=None, copy=True + self, + new_index, + axis: int, + method=None, + limit=None, + fill_value=None, + copy: bool = True, ): """ Conform block manager to new index. @@ -1183,16 +1235,24 @@ def reindex_axis( ) def reindex_indexer( - self, new_axis, indexer, axis, fill_value=None, allow_dups=False, copy=True - ): + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + ) -> T: """ Parameters ---------- new_axis : Index indexer : ndarray of int64 or None axis : int - fill_value : object - allow_dups : bool + fill_value : object, default None + allow_dups : bool, default False + copy : bool, default True + pandas-indexer with -1's only. 
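The split of `set` into a label-based `set` and a position-based `iset` above, together with `insert`, is what ordinary column assignment exercises. Seen from the public API:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
df["a"] = [10, 20]           # existing label -> iset on the current block
df["b"] = [3, 4]             # missing label  -> insert at the end
df.insert(0, "c", [5, 6])    # positional insert via BlockManager.insert
assert list(df.columns) == ["c", "a", "b"]
```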
""" @@ -1215,14 +1275,14 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0(indexer, fill_tuple=(fill_value,)) + new_blocks = self._slice_take_blocks_ax0(indexer, fill_value=fill_value) else: new_blocks = [ blk.take_nd( indexer, axis=axis, - fill_tuple=( - fill_value if fill_value is not None else blk.fill_value, + fill_value=( + fill_value if fill_value is not None else blk.fill_value ), ) for blk in self.blocks @@ -1230,9 +1290,10 @@ def reindex_indexer( new_axes = list(self.axes) new_axes[axis] = new_axis - return type(self)(new_blocks, new_axes) - def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): + return type(self).from_blocks(new_blocks, new_axes) + + def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): """ Slice/take blocks along axis=0. @@ -1242,7 +1303,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): ------- new_blocks : list of Block """ - allow_fill = fill_tuple is not None + allow_fill = fill_value is not lib.no_default sl_type, slobj, sllen = _preprocess_slice_or_indexer( slice_or_indexer, self.shape[0], allow_fill=allow_fill @@ -1252,30 +1313,33 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): blk = self.blocks[0] if sl_type in ("slice", "mask"): + # GH#32959 EABlock would fail since we cant make 0-width + # TODO(EA2D): special casing unnecessary with 2D EAs + if sllen == 0: + return [] return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: - if allow_fill and fill_tuple[0] is None: + if allow_fill and fill_value is None: _, fill_value = maybe_promote(blk.dtype) - fill_tuple = (fill_value,) return [ blk.take_nd( slobj, axis=0, new_mgr_locs=slice(0, sllen), - fill_tuple=fill_tuple, + fill_value=fill_value, ) ] if sl_type in ("slice", "mask"): - blknos = self._blknos[slobj] - blklocs = self._blklocs[slobj] + blknos = self.blknos[slobj] + blklocs = self.blklocs[slobj] else: blknos = algos.take_1d( - self._blknos, slobj, fill_value=-1, allow_fill=allow_fill + self.blknos, slobj, fill_value=-1, allow_fill=allow_fill ) blklocs = algos.take_1d( - self._blklocs, slobj, fill_value=-1, allow_fill=allow_fill + self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill ) # When filling blknos, make sure blknos is updated before appending to @@ -1286,8 +1350,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): blocks = [] for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=True): if blkno == -1: - # If we've got here, fill_tuple was not None. - fill_value = fill_tuple[0] + # If we've got here, fill_value was not lib.no_default blocks.append( self._make_na_block(placement=mgr_locs, fill_value=fill_value) @@ -1308,10 +1371,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): else: blocks.append( blk.take_nd( - blklocs[mgr_locs.indexer], - axis=0, - new_mgr_locs=mgr_locs, - fill_tuple=None, + blklocs[mgr_locs.indexer], axis=0, new_mgr_locs=mgr_locs, ) ) @@ -1329,7 +1389,7 @@ def _make_na_block(self, placement, fill_value=None): block_values.fill(fill_value) return make_block(block_values, placement=placement) - def take(self, indexer, axis=1, verify=True, convert=True): + def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ Take items along any axis. 
""" @@ -1377,14 +1437,13 @@ def canonicalize(block): block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks) ) - def unstack(self, unstacker_func, fill_value) -> "BlockManager": + def unstack(self, unstacker, fill_value) -> "BlockManager": """ Return a BlockManager with all blocks unstacked.. Parameters ---------- - unstacker_func : callable - A (partially-applied) ``pd.core.reshape._Unstacker`` class. + unstacker : reshape._Unstacker fill_value : Any fill_value for newly introduced missing values. @@ -1392,19 +1451,19 @@ def unstack(self, unstacker_func, fill_value) -> "BlockManager": ------- unstacked : BlockManager """ - n_rows = self.shape[-1] - dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) - new_columns = dummy.get_new_columns() - new_index = dummy.get_new_index() + new_columns = unstacker.get_new_columns(self.items) + new_index = unstacker.new_index + new_blocks: List[Block] = [] columns_mask: List[np.ndarray] = [] for blk in self.blocks: + blk_cols = self.items[blk.mgr_locs.indexer] + new_items = unstacker.get_new_columns(blk_cols) + new_placement = new_columns.get_indexer(new_items) + blocks, mask = blk._unstack( - partial(unstacker_func, value_columns=self.items[blk.mgr_locs.indexer]), - new_columns, - n_rows, - fill_value, + unstacker, fill_value, new_placement=new_placement ) new_blocks.extend(blocks) @@ -1427,52 +1486,42 @@ class SingleBlockManager(BlockManager): def __init__( self, block: Block, - axis: Union[Index, List[Index]], + axis: Index, do_integrity_check: bool = False, - fastpath: bool = False, + fastpath=lib.no_default, ): - if isinstance(axis, list): - if len(axis) != 1: - raise ValueError( - "cannot create SingleBlockManager with more than 1 axis" - ) - axis = axis[0] - - # passed from constructor, single block, single axis - if fastpath: - self.axes = [axis] - if isinstance(block, list): - - # empty block - if len(block) == 0: - block = [np.array([])] - elif len(block) != 1: - raise ValueError( - "Cannot create SingleBlockManager with more than 1 block" - ) - block = block[0] - else: - self.axes = [ensure_index(axis)] - - # create the block here - if isinstance(block, list): - - # provide consolidation to the interleaved_dtype - if len(block) > 1: - dtype = _interleaved_dtype(block) - block = [b.astype(dtype) for b in block] - block = _consolidate(block) + assert isinstance(block, Block), type(block) + assert isinstance(axis, Index), type(axis) + + if fastpath is not lib.no_default: + warnings.warn( + "The `fastpath` keyword is deprecated and will be removed " + "in a future version.", + FutureWarning, + stacklevel=2, + ) - if len(block) != 1: - raise ValueError( - "Cannot create SingleBlockManager with more than 1 block" - ) - block = block[0] + self.axes = [axis] + self.blocks = tuple([block]) - if not isinstance(block, Block): - block = make_block(block, placement=slice(0, len(axis)), ndim=1) + @classmethod + def from_blocks( + cls, blocks: List[Block], axes: List[Index] + ) -> "SingleBlockManager": + """ + Constructor for BlockManager and SingleBlockManager with same signature. + """ + assert len(blocks) == 1 + assert len(axes) == 1 + return cls(blocks[0], axes[0], do_integrity_check=False) - self.blocks = tuple([block]) + @classmethod + def from_array(cls, array: ArrayLike, index: Index) -> "SingleBlockManager": + """ + Constructor for if we have an array that is not yet a Block. 
+ """ + block = make_block(array, placement=slice(0, len(index)), ndim=1) + return cls(block, index) def _post_setstate(self): pass @@ -1481,10 +1530,6 @@ def _post_setstate(self): def _block(self) -> Block: return self.blocks[0] - @property - def _values(self): - return self._block.values - @property def _blknos(self): """ compat with BlockManager """ @@ -1499,20 +1544,19 @@ def get_slice(self, slobj: slice, axis: int = 0) -> "SingleBlockManager": if axis >= self.ndim: raise IndexError("Requested axis not found in manager") - return type(self)(self._block._slice(slobj), self.index[slobj], fastpath=True) + blk = self._block + array = blk._slice(slobj) + block = blk.make_block_same_class(array, placement=slice(0, len(array))) + return type(self)(block, self.index[slobj]) @property def index(self) -> Index: return self.axes[0] @property - def dtype(self): + def dtype(self) -> DtypeObj: return self._block.dtype - @property - def array_dtype(self): - return self._block.array_dtype - def get_dtype_counts(self): return {self.dtype.name: 1} @@ -1527,10 +1571,6 @@ def internal_values(self): """The array that Series._values returns""" return self._block.internal_values() - def get_values(self) -> np.ndarray: - """ return a dense type view """ - return np.array(self._block.to_dense(), copy=False) - @property def _can_hold_na(self) -> bool: return self._block._can_hold_na @@ -1544,15 +1584,14 @@ def _consolidate_check(self): def _consolidate_inplace(self): pass - def delete(self, item): + def idelete(self, indexer): """ - Delete single item from SingleBlockManager. + Delete single location from SingleBlockManager. Ensures that self.blocks doesn't become empty. """ - loc = self.items.get_loc(item) - self._block.delete(loc) - self.axes[0] = self.axes[0].delete(loc) + self._block.delete(indexer) + self.axes[0] = self.axes[0].delete(indexer) def fast_xs(self, loc): """ @@ -1561,47 +1600,12 @@ def fast_xs(self, loc): """ raise NotImplementedError("Use series._values[loc] instead") - def concat(self, to_concat, new_axis) -> "SingleBlockManager": - """ - Concatenate a list of SingleBlockManagers into a single - SingleBlockManager. - - Used for pd.concat of Series objects with axis=0. 
- - Parameters - ---------- - to_concat : list of SingleBlockManagers - new_axis : Index of the result - - Returns - ------- - SingleBlockManager - """ - non_empties = [x for x in to_concat if len(x) > 0] - - # check if all series are of the same block type: - if len(non_empties) > 0: - blocks = [obj.blocks[0] for obj in non_empties] - if len({b.dtype for b in blocks}) == 1: - new_block = blocks[0].concat_same_type(blocks) - else: - values = [x.values for x in blocks] - values = concat_compat(values) - new_block = make_block(values, placement=slice(0, len(values), 1)) - else: - values = [x._block.values for x in to_concat] - values = concat_compat(values) - new_block = make_block(values, placement=slice(0, len(values), 1)) - - mgr = SingleBlockManager(new_block, new_axis) - return mgr - # -------------------------------------------------------------------- # Constructor Helpers -def create_block_manager_from_blocks(blocks, axes): +def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: try: if len(blocks) == 1 and not isinstance(blocks[0], Block): # if blocks[0] is of length 0, return empty blocks @@ -1622,10 +1626,15 @@ def create_block_manager_from_blocks(blocks, axes): except ValueError as e: blocks = [getattr(b, "values", b) for b in blocks] tot_items = sum(b.shape[0] for b in blocks) - construction_error(tot_items, blocks[0].shape[1:], axes, e) + raise construction_error(tot_items, blocks[0].shape[1:], axes, e) -def create_block_manager_from_arrays(arrays, names, axes): +def create_block_manager_from_arrays( + arrays, names: Index, axes: List[Index] +) -> BlockManager: + assert isinstance(names, Index) + assert isinstance(axes, list) + assert all(isinstance(x, Index) for x in axes) try: blocks = form_blocks(arrays, names, axes) @@ -1633,7 +1642,7 @@ def create_block_manager_from_arrays(arrays, names, axes): mgr._consolidate_inplace() return mgr except ValueError as e: - construction_error(len(arrays), arrays[0].shape, axes, e) + raise construction_error(len(arrays), arrays[0].shape, axes, e) def construction_error(tot_items, block_shape, axes, e=None): @@ -1648,23 +1657,25 @@ def construction_error(tot_items, block_shape, axes, e=None): if len(implied) <= 2: implied = implied[::-1] + # We return the exception object instead of raising it so that we + # can raise it in the caller; mypy plays better with that if passed == implied and e is not None: - raise e + return e if block_shape[0] == 0: - raise ValueError("Empty data passed with indices specified.") - raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") + return ValueError("Empty data passed with indices specified.") + return ValueError(f"Shape of passed values is {passed}, indices imply {implied}") # ----------------------------------------------------------------------- -def form_blocks(arrays, names, axes): +def form_blocks(arrays, names: Index, axes) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? 
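`construction_error` above now returns the exception and callers write `raise construction_error(...)`, which keeps mypy happy about code paths that cannot fall through. A minimal sketch of the pattern with a simplified signature:

```python
def construction_error(passed: int, implied: int) -> ValueError:
    # return instead of raise; the caller raises the returned exception
    if implied == 0:
        return ValueError("Empty data passed with indices specified.")
    return ValueError(
        f"Shape of passed values is {passed}, indices imply {implied}"
    )

try:
    raise construction_error(3, 2)
except ValueError as err:
    assert "indices imply 2" in str(err)
```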
- items_dict = defaultdict(list) + items_dict: DefaultDict[str, List] = defaultdict(list) extra_locs = [] - names_idx = ensure_index(names) + names_idx = names if names_idx.equals(axes[0]): names_indexer = np.arange(len(names_idx)) else: @@ -1682,7 +1693,7 @@ def form_blocks(arrays, names, axes): block_type = get_block_type(v) items_dict[block_type.__name__].append((i, k, v)) - blocks = [] + blocks: List[Block] = [] if len(items_dict["FloatBlock"]): float_blocks = _multi_blockify(items_dict["FloatBlock"]) blocks.extend(float_blocks) @@ -1700,12 +1711,12 @@ def form_blocks(arrays, names, axes): blocks.extend(int_blocks) if len(items_dict["DatetimeBlock"]): - datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], _NS_DTYPE) + datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE) blocks.extend(datetime_blocks) if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ - make_block(array, klass=DatetimeTZBlock, placement=[i]) + make_block(array, klass=DatetimeTZBlock, placement=i) for i, _, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1720,7 +1731,7 @@ def form_blocks(arrays, names, axes): if len(items_dict["CategoricalBlock"]) > 0: cat_blocks = [ - make_block(array, klass=CategoricalBlock, placement=[i]) + make_block(array, klass=CategoricalBlock, placement=i) for i, _, array in items_dict["CategoricalBlock"] ] blocks.extend(cat_blocks) @@ -1728,7 +1739,7 @@ def form_blocks(arrays, names, axes): if len(items_dict["ExtensionBlock"]): external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=[i]) + make_block(array, klass=ExtensionBlock, placement=i) for i, _, array in items_dict["ExtensionBlock"] ] @@ -1736,7 +1747,7 @@ def form_blocks(arrays, names, axes): if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ - make_block(array, klass=ObjectValuesExtensionBlock, placement=[i]) + make_block(array, klass=ObjectValuesExtensionBlock, placement=i) for i, _, array in items_dict["ObjectValuesExtensionBlock"] ] @@ -1843,13 +1854,43 @@ def _consolidate(blocks): new_blocks = [] for (_can_consolidate, dtype), group_blocks in grouper: merged_blocks = _merge_blocks( - list(group_blocks), dtype=dtype, _can_consolidate=_can_consolidate + list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate ) new_blocks = _extend_blocks(merged_blocks, new_blocks) return new_blocks -def _compare_or_regex_search(a, b, regex=False): +def _merge_blocks( + blocks: List[Block], dtype: DtypeObj, can_consolidate: bool +) -> List[Block]: + + if len(blocks) == 1: + return blocks + + if can_consolidate: + + if dtype is None: + if len({b.dtype for b in blocks}) != 1: + raise AssertionError("_merge_blocks are invalid!") + + # TODO: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. 
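`form_blocks` above buckets the input arrays by the Block subclass that will hold them before consolidating each bucket. A rough standalone sketch of the grouping, using numpy `dtype.kind` in place of `get_block_type`:

```python
from collections import defaultdict
from typing import DefaultDict, List, Tuple

import numpy as np

def bucket_arrays(arrays) -> DefaultDict[str, List[Tuple[int, np.ndarray]]]:
    # group (position, array) pairs by a stand-in for the block type
    items: DefaultDict[str, List[Tuple[int, np.ndarray]]] = defaultdict(list)
    for i, arr in enumerate(arrays):
        items[arr.dtype.kind].append((i, arr))
    return items

buckets = bucket_arrays([np.array([1.0]), np.array([1]), np.array([2.0])])
assert [i for i, _ in buckets["f"]] == [0, 2]   # float columns 0 and 2
assert [i for i, _ in buckets["i"]] == [1]      # integer column 1
```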
+ new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) + new_values = np.vstack([b.values for b in blocks]) + + argsort = np.argsort(new_mgr_locs) + new_values = new_values[argsort] + new_mgr_locs = new_mgr_locs[argsort] + + return [make_block(new_values, placement=new_mgr_locs)] + + # can't consolidate --> no merge + return blocks + + +def _compare_or_regex_search( + a: Union[ArrayLike, Scalar], b: Union[ArrayLike, Scalar], regex: bool = False +) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -1866,67 +1907,77 @@ def _compare_or_regex_search(a, b, regex=False): ------- mask : array_like of bool """ + + def _check_comparison_types( + result: Union[ArrayLike, bool], + a: Union[ArrayLike, Scalar], + b: Union[ArrayLike, Scalar], + ) -> Union[ArrayLike, bool]: + """ + Raises an error if the two arrays (a,b) cannot be compared. + Otherwise, returns the comparison result as expected. + """ + if is_scalar(result) and ( + isinstance(a, np.ndarray) or isinstance(b, np.ndarray) + ): + type_names = [type(a).__name__, type(b).__name__] + + if isinstance(a, np.ndarray): + type_names[0] = f"ndarray(dtype={a.dtype})" + + if isinstance(b, np.ndarray): + type_names[1] = f"ndarray(dtype={b.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + return result + if not regex: op = lambda x: operator.eq(x, b) else: op = np.vectorize( - lambda x: bool(re.search(b, x)) if isinstance(x, str) else False + lambda x: bool(re.search(b, x)) + if isinstance(x, str) and isinstance(b, str) + else False ) - is_a_array = isinstance(a, np.ndarray) - is_b_array = isinstance(b, np.ndarray) + # GH#32621 use mask to avoid comparing to NAs + if isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): + mask = np.reshape(~(isna(a)), a.shape) + elif isinstance(b, np.ndarray) and not isinstance(a, np.ndarray): + mask = np.reshape(~(isna(b)), b.shape) + elif isinstance(a, np.ndarray) and isinstance(b, np.ndarray): + mask = ~(isna(a) | isna(b)) + if isinstance(a, np.ndarray): + a = a[mask] + if isinstance(b, np.ndarray): + b = b[mask] if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): # GH#29553 avoid deprecation warnings from numpy - result = False - else: - result = op(a) + return _check_comparison_types(False, a, b) - if is_scalar(result) and (is_a_array or is_b_array): - type_names = [type(a).__name__, type(b).__name__] + result = op(a) - if is_a_array: - type_names[0] = f"ndarray(dtype={a.dtype})" + if isinstance(result, np.ndarray): + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool) + tmp[mask] = result + result = tmp - if is_b_array: - type_names[1] = f"ndarray(dtype={b.dtype})" - - raise TypeError( - f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" - ) - return result - - -def _transform_index(index, func, level=None): - """ - Apply function to all values found in index. + return _check_comparison_types(result, a, b) - This includes transforming multiindex entries separately. - Only apply function to one level of the MultiIndex if level is specified. 
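The `_merge_blocks` helper moved into this module stacks same-dtype block values and re-sorts the rows by their manager locations. A numpy-only sketch of that merge step:

```python
import numpy as np

def merge_values(values_list, mgr_locs_list):
    new_locs = np.concatenate(mgr_locs_list)
    new_values = np.vstack(values_list)
    order = np.argsort(new_locs)          # restore manager-location order
    return new_values[order], new_locs[order]

vals, locs = merge_values(
    [np.array([[10, 11]]), np.array([[20, 21]])],
    [np.array([1]), np.array([0])],
)
assert locs.tolist() == [0, 1]
assert vals.tolist() == [[20, 21], [10, 11]]
```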
- """ - if isinstance(index, MultiIndex): - if level is not None: - items = [ - tuple(func(y) if i == level else y for i, y in enumerate(x)) - for x in index - ] - else: - items = [tuple(func(y) for y in x) for x in index] - return MultiIndex.from_tuples(items, names=index.names) - else: - items = [func(x) for x in index] - return Index(items, name=index.name, tupleize_cols=False) - - -def _fast_count_smallints(arr): +def _fast_count_smallints(arr: np.ndarray) -> np.ndarray: """Faster version of set(arr) for sequences of small numbers.""" counts = np.bincount(arr.astype(np.int_)) nz = counts.nonzero()[0] return np.c_[nz, counts[nz]] -def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): +def _preprocess_slice_or_indexer(slice_or_indexer, length: int, allow_fill: bool): if isinstance(slice_or_indexer, slice): return ( "slice", @@ -1942,45 +1993,3 @@ def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): if not allow_fill: indexer = maybe_convert_indices(indexer, length) return "fancy", indexer, len(indexer) - - -def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): - """ - Concatenate block managers into one. - - Parameters - ---------- - mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples - axes : list of Index - concat_axis : int - copy : bool - - """ - concat_plans = [ - get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers - ] - concat_plan = combine_concat_plans(concat_plans, concat_axis) - blocks = [] - - for placement, join_units in concat_plan: - - if len(join_units) == 1 and not join_units[0].indexers: - b = join_units[0].block - values = b.values - if copy: - values = values.copy() - else: - values = values.view() - b = b.make_block_same_class(values, placement=placement) - elif is_uniform_join_units(join_units): - b = join_units[0].block.concat_same_type( - [ju.block for ju in join_units], placement=placement - ) - else: - b = make_block( - concatenate_join_units(join_units, concat_axis, copy=copy), - placement=placement, - ) - blocks.append(b) - - return BlockManager(blocks, axes) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index b30a7a24f3495..c46aed999f45a 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -343,10 +343,10 @@ def _interpolate_scipy_wrapper( if method == "pchip": try: alt_methods["pchip"] = interpolate.pchip_interpolate - except AttributeError: + except AttributeError as err: raise ImportError( "Your version of Scipy does not support PCHIP interpolation." 
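`_fast_count_smallints` (only annotated in this hunk) relies on `np.bincount` to produce (value, count) pairs much more cheaply than building a set. For reference:

```python
import numpy as np

def fast_count_smallints(arr: np.ndarray) -> np.ndarray:
    counts = np.bincount(arr.astype(np.int_))
    nz = counts.nonzero()[0]
    return np.c_[nz, counts[nz]]          # columns: value, count

out = fast_count_smallints(np.array([0, 2, 2, 5]))
assert out.tolist() == [[0, 1], [2, 2], [5, 1]]
```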
- ) + ) from err elif method == "akima": alt_methods["akima"] = _akima_interpolate diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a5c609473760d..9494248a423a8 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -7,8 +7,8 @@ from pandas._config import get_option -from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib -from pandas._typing import Dtype, Scalar +from pandas._libs import NaT, Period, Timedelta, Timestamp, iNaT, lib +from pandas._typing import ArrayLike, Dtype, Scalar from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask @@ -17,9 +17,7 @@ is_any_int_dtype, is_bool_dtype, is_complex, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_datetime_or_timedelta_dtype, + is_datetime64_any_dtype, is_float, is_float_dtype, is_integer, @@ -28,10 +26,14 @@ is_object_dtype, is_scalar, is_timedelta64_dtype, + needs_i8_conversion, pandas_dtype, ) +from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna +from pandas.core.construction import extract_array + bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False @@ -73,7 +75,7 @@ def _f(*args, **kwargs): # e.g. this is normally a disallowed function on # object arrays that contain strings if is_object_dtype(args[0]): - raise TypeError(e) + raise TypeError(e) from e raise return _f @@ -132,10 +134,8 @@ def f( def _bn_ok_dtype(dtype: Dtype, name: str) -> bool: - # Bottleneck chokes on datetime64 - if not is_object_dtype(dtype) and not ( - is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype) - ): + # Bottleneck chokes on datetime64, PeriodDtype (or and EA) + if not is_object_dtype(dtype) and not needs_i8_conversion(dtype): # GH 15507 # bottleneck does not properly upcast during the sum @@ -281,23 +281,16 @@ def _get_values( # with scalar fill_value. This guarantee is important for the # maybe_upcast_putmask call below assert is_scalar(fill_value) + values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) - if is_datetime64tz_dtype(values): - # lib.values_from_object returns M8[ns] dtype instead of tz-aware, - # so this case must be handled separately from the rest - dtype = values.dtype - values = getattr(values, "_values", values) - else: - values = lib.values_from_object(values) - dtype = values.dtype + dtype = values.dtype - if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): + if needs_i8_conversion(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above - values = getattr(values, "asi8", values) - values = values.view(np.int64) + values = np.asarray(values.view("i8")) dtype_ok = _na_ok_dtype(dtype) @@ -311,7 +304,8 @@ def _get_values( if skipna and copy: values = values.copy() - if dtype_ok: + assert mask is not None # for mypy + if dtype_ok and mask.any(): np.putmask(values, mask, fill_value) # promote if needed @@ -329,13 +323,14 @@ def _get_values( def _na_ok_dtype(dtype) -> bool: - # TODO: what about datetime64tz? PeriodDtype? 
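`_get_values` above now funnels every datetimelike dtype through `needs_i8_conversion` and views the data as int64, so NaT becomes an ordinary integer sentinel that the masking machinery can handle:

```python
import numpy as np

iNaT = np.iinfo(np.int64).min        # the sentinel pandas stores for NaT

values = np.array(["2020-01-01", "NaT"], dtype="datetime64[ns]")
mask = np.isnat(values)
as_i8 = values.view("i8")            # same reinterpretation as in _get_values
assert (as_i8[mask] == iNaT).all()
```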
- return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) + if needs_i8_conversion(dtype): + return False + return not issubclass(dtype.type, np.integer) def _wrap_results(result, dtype: Dtype, fill_value=None): """ wrap our results if needed """ - if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): + if is_datetime64_any_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT @@ -346,7 +341,8 @@ def _wrap_results(result, dtype: Dtype, fill_value=None): result = np.nan result = Timestamp(result, tz=tz) else: - result = result.view(dtype) + # If we have float dtype, taking a view will give the wrong result + result = result.astype(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): if result == fill_value: @@ -360,6 +356,14 @@ def _wrap_results(result, dtype: Dtype, fill_value=None): else: result = result.astype("m8[ns]").view(dtype) + elif isinstance(dtype, PeriodDtype): + if is_float(result) and result.is_integer(): + result = int(result) + if is_integer(result): + result = Period._from_ordinal(result, freq=dtype.freq) + else: + raise NotImplementedError(type(result), result) + return result @@ -546,12 +550,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): ) dtype_sum = dtype_max dtype_count = np.float64 - if ( - is_integer_dtype(dtype) - or is_timedelta64_dtype(dtype) - or is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - ): + if is_integer_dtype(dtype) or needs_i8_conversion(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype @@ -607,9 +606,9 @@ def get_median(x): if not is_float_dtype(values.dtype): try: values = values.astype("f8") - except ValueError: + except ValueError as err: # e.g. "could not convert string to float: 'a'" - raise TypeError + raise TypeError from err if mask is not None: values[mask] = np.nan @@ -758,7 +757,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): >>> nanops.nanvar(s) 1.0 """ - values = lib.values_from_object(values) + values = extract_array(values, extract_numpy=True) dtype = values.dtype mask = _maybe_get_mask(values, skipna, mask) if is_any_int_dtype(values): @@ -981,11 +980,11 @@ def nanskew( Examples -------- >>> import pandas.core.nanops as nanops - >>> s = pd.Series([1,np.nan, 1, 2]) + >>> s = pd.Series([1, np.nan, 1, 2]) >>> nanops.nanskew(s) 1.7320508075688787 """ - values = lib.values_from_object(values) + values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): values = values.astype("f8") @@ -1065,11 +1064,11 @@ def nankurt( Examples -------- >>> import pandas.core.nanops as nanops - >>> s = pd.Series([1,np.nan, 1, 3, 2]) + >>> s = pd.Series([1, np.nan, 1, 3, 2]) >>> nanops.nankurt(s) -1.2892561983471076 """ - values = lib.values_from_object(values) + values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): values = values.astype("f8") @@ -1239,7 +1238,7 @@ def _maybe_null_out( result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], - shape: Tuple, + shape: Tuple[int, ...], min_count: int = 1, ) -> float: """ @@ -1261,16 +1260,43 @@ def _maybe_null_out( # GH12941, use None to auto cast null result[null_mask] = None elif result is not NaT: - if mask is not None: - null_mask = mask.size - mask.sum() - else: - null_mask = np.prod(shape) - if null_mask < min_count: + if check_below_min_count(shape, mask, min_count): result = np.nan return result 
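The new Period branch in `_wrap_results` above works because a Period is essentially an integer ordinal plus a frequency, so an integer reduction result can be re-wrapped with `Period._from_ordinal`. The ordinal layout, checked through the public API:

```python
import pandas as pd

assert pd.Period("1970-01", freq="M").ordinal == 0     # epoch anchor
assert pd.Period("2020-01", freq="M").ordinal == 600   # 50 years * 12 months
```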
+def check_below_min_count( + shape: Tuple[int, ...], mask: Optional[np.ndarray], min_count: int +): + """ + Check for the `min_count` keyword. Returns True if below `min_count` (when + missing value should be returned from the reduction). + + Parameters + ---------- + shape : tuple + The shape of the values (`values.shape`). + mask : ndarray or None + Boolean numpy array (typically of same shape as `shape`) or None. + min_count : int + Keyword passed through from sum/prod call. + + Returns + ------- + bool + """ + if min_count > 0: + if mask is None: + # no missing values, only check size + non_nulls = np.prod(shape) + else: + non_nulls = mask.size - mask.sum() + if non_nulls < min_count: + return True + return False + + def _zero_out_fperr(arg): # #18044 reference this behavior to fix rolling skew/kurt issue if isinstance(arg, np.ndarray): @@ -1306,30 +1332,33 @@ def nancorr( def get_corr_func(method): - if method in ["kendall", "spearman"]: - from scipy.stats import kendalltau, spearmanr - elif method in ["pearson"]: - pass - elif callable(method): - return method - else: - raise ValueError( - f"Unkown method '{method}', expected one of 'kendall', 'spearman'" - ) + if method == "kendall": + from scipy.stats import kendalltau - def _pearson(a, b): - return np.corrcoef(a, b)[0, 1] + def func(a, b): + return kendalltau(a, b)[0] - def _kendall(a, b): - # kendallttau returns a tuple of the tau statistic and pvalue - rs = kendalltau(a, b) - return rs[0] + return func + elif method == "spearman": + from scipy.stats import spearmanr - def _spearman(a, b): - return spearmanr(a, b)[0] + def func(a, b): + return spearmanr(a, b)[0] - _cor_methods = {"pearson": _pearson, "kendall": _kendall, "spearman": _spearman} - return _cor_methods[method] + return func + elif method == "pearson": + + def func(a, b): + return np.corrcoef(a, b)[0, 1] + + return func + elif callable(method): + return method + + raise ValueError( + f"Unknown method '{method}', expected one of " + "'kendall', 'spearman', 'pearson', or callable" + ) @disallow("M8", "m8") @@ -1361,9 +1390,9 @@ def _ensure_numeric(x): except (TypeError, ValueError): try: x = x.astype(np.float64) - except ValueError: + except ValueError as err: # GH#29941 we get here with object arrays containing strs - raise TypeError(f"Could not convert {x} to numeric") + raise TypeError(f"Could not convert {x} to numeric") from err else: if not np.any(np.imag(x)): x = x.real @@ -1374,9 +1403,9 @@ def _ensure_numeric(x): # e.g. "1+1j" or "foo" try: x = complex(x) - except ValueError: + except ValueError as err: # e.g. "foo" - raise TypeError(f"Could not convert {x} to numeric") + raise TypeError(f"Could not convert {x} to numeric") from err return x @@ -1501,3 +1530,75 @@ def nanpercentile( return result else: return np.percentile(values, q, axis=axis, interpolation=interpolation) + + +def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: + """ + Cumulative function with skipna support. 
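`check_below_min_count` above factors out the `min_count` bookkeeping used by sum/prod. Its observable effect through the public API:

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan])
assert s.sum(min_count=1) == 1.0          # enough non-NA values
assert np.isnan(s.sum(min_count=2))       # below min_count -> NaN
```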
+ + Parameters + ---------- + values : np.ndarray or ExtensionArray + accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate} + skipna : bool + + Returns + ------- + np.ndarray or ExtensionArray + """ + mask_a, mask_b = { + np.cumprod: (1.0, np.nan), + np.maximum.accumulate: (-np.inf, np.nan), + np.cumsum: (0.0, np.nan), + np.minimum.accumulate: (np.inf, np.nan), + }[accum_func] + + # We will be applying this function to block values + if values.dtype.kind in ["m", "M"]: + # GH#30460, GH#29058 + # numpy 1.18 started sorting NaTs at the end instead of beginning, + # so we need to work around to maintain backwards-consistency. + orig_dtype = values.dtype + + # We need to define mask before masking NaTs + mask = isna(values) + + if accum_func == np.minimum.accumulate: + # Note: the accum_func comparison fails as an "is" comparison + y = values.view("i8") + y[mask] = np.iinfo(np.int64).max + changed = True + else: + y = values + changed = False + + result = accum_func(y.view("i8"), axis=0) + if skipna: + result[mask] = iNaT + elif accum_func == np.minimum.accumulate: + # Restore NaTs that we masked previously + nz = (~np.asarray(mask)).nonzero()[0] + if len(nz): + # everything up to the first non-na entry stays NaT + result[: nz[0]] = iNaT + + if changed: + # restore NaT elements + y[mask] = iNaT # TODO: could try/finally for this? + + if isinstance(values, np.ndarray): + result = result.view(orig_dtype) + else: + # DatetimeArray + result = type(values)._from_sequence(result, dtype=orig_dtype) + + elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): + vals = values.copy() + mask = isna(vals) + vals[mask] = mask_a + result = accum_func(vals, axis=0) + result[mask] = mask_b + else: + result = accum_func(values, axis=0) + + return result diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index d0adf2da04db3..c14c4a311d66c 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -3,18 +3,17 @@ This is not a public API. """ -import datetime import operator -from typing import TYPE_CHECKING, Optional, Set, Tuple +from typing import TYPE_CHECKING, Optional, Set import numpy as np -from pandas._libs import Timedelta, Timestamp, lib +from pandas._libs import lib from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 -from pandas._typing import ArrayLike, Level +from pandas._typing import Level from pandas.util._decorators import Appender -from pandas.core.dtypes.common import is_list_like, is_timedelta64_dtype +from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -152,65 +151,6 @@ def _maybe_match_name(a, b): return None -def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): - """ - Cast non-pandas objects to pandas types to unify behavior of arithmetic - and comparison operations. - - Parameters - ---------- - obj: object - shape : tuple[int] - - Returns - ------- - out : object - - Notes - ----- - Be careful to call this *after* determining the `name` attribute to be - attached to the result of the arithmetic operation. 
- """ - from pandas.core.arrays import DatetimeArray, TimedeltaArray - - if type(obj) is datetime.timedelta: - # GH#22390 cast up to Timedelta to rely on Timedelta - # implementation; otherwise operation against numeric-dtype - # raises TypeError - return Timedelta(obj) - elif isinstance(obj, np.datetime64): - # GH#28080 numpy casts integer-dtype to datetime64 when doing - # array[int] + datetime64, which we do not allow - if isna(obj): - # Avoid possible ambiguities with pd.NaT - obj = obj.astype("datetime64[ns]") - right = np.broadcast_to(obj, shape) - return DatetimeArray(right) - - return Timestamp(obj) - - elif isinstance(obj, np.timedelta64): - if isna(obj): - # wrapping timedelta64("NaT") in Timedelta returns NaT, - # which would incorrectly be treated as a datetime-NaT, so - # we broadcast and wrap in a TimedeltaArray - obj = obj.astype("timedelta64[ns]") - right = np.broadcast_to(obj, shape) - return TimedeltaArray(right) - - # In particular non-nanosecond timedelta64 needs to be cast to - # nanoseconds, or else we get undesired behavior like - # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') - return Timedelta(obj) - - elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj.dtype): - # GH#22390 Unfortunately we need to special-case right-hand - # timedelta64 dtypes because numpy casts integer dtypes to - # timedelta64 when operating with timedelta64 - return TimedeltaArray._from_sequence(obj) - return obj - - # ----------------------------------------------------------------------------- @@ -284,7 +224,7 @@ def _get_opstr(op): }[op] -def _get_op_name(op, special): +def _get_op_name(op, special: bool) -> str: """ Find the name to attach to this method according to conventions for special and non-special methods. @@ -379,7 +319,7 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): # Get the appropriate array-op to apply to each block's values. array_op = get_array_op(func, str_rep=str_rep) - bm = left._data.apply(array_op, right=right) + bm = left._mgr.apply(array_op, right=right) return type(left)(bm) elif isinstance(right, ABCDataFrame): @@ -445,42 +385,6 @@ def _align_method_SERIES(left, right, align_asobject=False): return left, right -def _construct_result( - left: ABCSeries, result: ArrayLike, index: ABCIndexClass, name, -): - """ - Construct an appropriately-labelled Series from the result of an op. - - Parameters - ---------- - left : Series - result : ndarray or ExtensionArray - index : Index - name : object - - Returns - ------- - Series - In the case of __divmod__ or __rdivmod__, a 2-tuple of Series. - """ - if isinstance(result, tuple): - # produced by divmod or rdivmod - return ( - _construct_result(left, result[0], index=index, name=name), - _construct_result(left, result[1], index=index, name=name), - ) - - # We do not pass dtype to ensure that the Series constructor - # does inference in the case where `result` has object-dtype. 
- out = left._constructor(result, index=index) - out = out.__finalize__(left) - - # Set the result's name after __finalize__ is called because __finalize__ - # would set it back to self.name - out.name = name - return out - - def _arith_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid @@ -499,7 +403,7 @@ def wrapper(left, right): rvalues = extract_array(right, extract_numpy=True) result = arithmetic_op(lvalues, rvalues, op, str_rep) - return _construct_result(left, result, index=left.index, name=res_name) + return left._construct_result(result, name=res_name) wrapper.__name__ = op_name return wrapper @@ -526,7 +430,7 @@ def wrapper(self, other): res_values = comparison_op(lvalues, rvalues, op, str_rep) - return _construct_result(self, res_values, index=self.index, name=res_name) + return self._construct_result(res_values, name=res_name) wrapper.__name__ = op_name return wrapper @@ -548,7 +452,7 @@ def wrapper(self, other): rvalues = extract_array(other, extract_numpy=True) res_values = logical_op(lvalues, rvalues, op) - return _construct_result(self, res_values, index=self.index, name=res_name) + return self._construct_result(res_values, name=res_name) wrapper.__name__ = op_name return wrapper @@ -585,7 +489,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # DataFrame -def _combine_series_frame(left, right, func, axis: int): +def _combine_series_frame(left, right, func, axis: int, str_rep: str): """ Apply binary operator `func` to self, other using alignment and fill conventions determined by the axis argument. @@ -596,6 +500,7 @@ def _combine_series_frame(left, right, func, axis: int): right : Series func : binary operator axis : {0, 1} + str_rep : str Returns ------- @@ -603,7 +508,17 @@ def _combine_series_frame(left, right, func, axis: int): """ # We assume that self.align(other, ...) has already been called if axis == 0: - new_data = left._combine_match_index(right, func) + values = right._values + if isinstance(values, np.ndarray): + # We can operate block-wise + values = values.reshape(-1, 1) + + array_op = get_array_op(func, str_rep=str_rep) + bm = left._mgr.apply(array_op, right=values.T) + return type(left)(bm) + + new_data = dispatch_to_series(left, right, func) + else: new_data = dispatch_to_series(left, right, func, axis="columns") @@ -674,7 +589,8 @@ def to_series(right): elif right.ndim > 2: raise ValueError( - f"Unable to coerce to Series/DataFrame, dim must be <= 2: {right.shape}" + "Unable to coerce to Series/DataFrame, " + f"dimension must be <= 2: {right.shape}" ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): @@ -700,13 +616,17 @@ def to_series(right): def _should_reindex_frame_op( - left: "DataFrame", right, axis, default_axis: int, fill_value, level + left: "DataFrame", right, op, axis, default_axis: int, fill_value, level ) -> bool: """ Check if this is an operation between DataFrames that will need to reindex. 
""" assert isinstance(left, ABCDataFrame) + if op is operator.pow or op is rpow: + # GH#32685 pow has special semantics for operating with null values + return False + if not isinstance(right, ABCDataFrame): return False @@ -768,7 +688,9 @@ def _arith_method_FRAME(cls, op, special): @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): - if _should_reindex_frame_op(self, other, axis, default_axis, fill_value, level): + if _should_reindex_frame_op( + self, other, op, axis, default_axis, fill_value, level + ): return _frame_arith_method_with_reindex(self, other, op) self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) @@ -791,7 +713,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): raise NotImplementedError(f"fill_value {fill_value} not supported.") axis = self._get_axis_number(axis) if axis is not None else 1 - return _combine_series_frame(self, other, pass_op, axis=axis) + return _combine_series_frame( + self, other, pass_op, axis=axis, str_rep=str_rep + ) else: # in this case we always have `np.ndim(other) == 0` if fill_value is not None: @@ -826,7 +750,7 @@ def f(self, other, axis=default_axis, level=None): elif isinstance(other, ABCSeries): axis = self._get_axis_number(axis) if axis is not None else 1 - return _combine_series_frame(self, other, op, axis=axis) + return _combine_series_frame(self, other, op, axis=axis, str_rep=str_rep) else: # in this case we always have `np.ndim(other) == 0` new_data = dispatch_to_series(self, other, op, str_rep) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index b216a927f65b3..5dd7af454cbd1 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -2,9 +2,10 @@ Functions for arithmetic and comparison operations on NumPy arrays and ExtensionArrays. """ +from datetime import timedelta from functools import partial import operator -from typing import Any, Optional +from typing import Any, Optional, Tuple import numpy as np @@ -24,18 +25,11 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.generic import ( - ABCDatetimeArray, - ABCExtensionArray, - ABCIndex, - ABCIndexClass, - ABCSeries, - ABCTimedeltaArray, -) +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndex, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core.ops import missing -from pandas.core.ops.dispatch import dispatch_to_extension_op, should_extension_dispatch +from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison from pandas.core.ops.roperator import rpow @@ -51,15 +45,17 @@ def comp_method_OBJECT_ARRAY(op, x, y): y = y.astype(np.object_) if isinstance(y, (ABCSeries, ABCIndex)): - y = y.values + y = y._values - result = libops.vec_compare(x.ravel(), y, op) + if x.shape != y.shape: + raise ValueError("Shapes must match", x.shape, y.shape) + result = libops.vec_compare(x.ravel(), y.ravel(), op) else: result = libops.scalar_compare(x.ravel(), y, op) return result.reshape(x.shape) -def masked_arith_op(x, y, op): +def masked_arith_op(x: np.ndarray, y, op): """ If the given arithmetic operation fails, attempt it again on only the non-null elements of the input array(s). 
@@ -78,10 +74,22 @@ def masked_arith_op(x, y, op): dtype = find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) + if len(x) != len(y): + if not _can_broadcast(x, y): + raise ValueError(x.shape, y.shape) + + # Call notna on pre-broadcasted y for performance + ymask = notna(y) + y = np.broadcast_to(y, x.shape) + ymask = np.broadcast_to(ymask, x.shape) + + else: + ymask = notna(y) + # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex # we would get int64 dtype, see GH#19956 yrav = y.ravel() - mask = notna(xrav) & notna(yrav) + mask = notna(xrav) & ymask.ravel() if yrav.shape != mask.shape: # FIXME: GH#5284, GH#5035, GH#19448 @@ -186,23 +194,15 @@ def arithmetic_op(left: ArrayLike, right: Any, op, str_rep: str): ndarrray or ExtensionArray Or a 2-tuple of these in the case of divmod or rdivmod. """ - from pandas.core.ops import maybe_upcast_for_op # NB: We assume that extract_array has already been called # on `left` and `right`. - lvalues = left - rvalues = right - - rvalues = maybe_upcast_for_op(rvalues, lvalues.shape) + lvalues = maybe_upcast_datetimelike_array(left) + rvalues = maybe_upcast_for_op(right, lvalues.shape) - if should_extension_dispatch(left, rvalues) or isinstance( - rvalues, (ABCTimedeltaArray, ABCDatetimeArray, Timestamp, Timedelta) - ): - # TimedeltaArray, DatetimeArray, and Timestamp are included here - # because they have `freq` attribute which is handled correctly - # by dispatch_to_extension_op. + if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta): # Timedelta is included because numexpr will fail on it, see GH#31457 - res_values = dispatch_to_extension_op(op, lvalues, rvalues) + res_values = op(lvalues, rvalues) else: with np.errstate(all="ignore"): @@ -211,6 +211,51 @@ def arithmetic_op(left: ArrayLike, right: Any, op, str_rep: str): return res_values +def _broadcast_comparison_op(lvalues, rvalues, op) -> np.ndarray: + """ + Broadcast a comparison operation between two 2D arrays. + + Parameters + ---------- + lvalues : np.ndarray or ExtensionArray + rvalues : np.ndarray or ExtensionArray + + Returns + ------- + np.ndarray[bool] + """ + if isinstance(rvalues, np.ndarray): + rvalues = np.broadcast_to(rvalues, lvalues.shape) + result = comparison_op(lvalues, rvalues, op) + else: + result = np.empty(lvalues.shape, dtype=bool) + for i in range(len(lvalues)): + result[i, :] = comparison_op(lvalues[i], rvalues[:, 0], op) + return result + + +def _can_broadcast(lvalues, rvalues) -> bool: + """ + Check if we can broadcast rvalues to match the shape of lvalues. + + Parameters + ---------- + lvalues : np.ndarray or ExtensionArray + rvalues : np.ndarray or ExtensionArray + + Returns + ------- + bool + """ + # We assume that lengths dont match + if lvalues.ndim == rvalues.ndim == 2: + # See if we can broadcast unambiguously + if lvalues.shape[1] == rvalues.shape[-1]: + if rvalues.shape[0] == 1: + return True + return False + + def comparison_op( left: ArrayLike, right: Any, op, str_rep: Optional[str] = None, ) -> ArrayLike: @@ -229,7 +274,7 @@ def comparison_op( ndarray or ExtensionArray """ # NB: We assume extract_array has already been called on left and right - lvalues = left + lvalues = maybe_upcast_datetimelike_array(left) rvalues = right rvalues = lib.item_from_zerodim(rvalues) @@ -237,15 +282,20 @@ def comparison_op( # TODO: same for tuples? 
rvalues = np.asarray(rvalues) - if isinstance(rvalues, (np.ndarray, ABCExtensionArray, ABCIndexClass)): + if isinstance(rvalues, (np.ndarray, ABCExtensionArray)): # TODO: make this treatment consistent across ops and classes. # We are not catching all listlikes here (e.g. frozenset, tuple) # The ambiguous case is object-dtype. See GH#27803 if len(lvalues) != len(rvalues): - raise ValueError("Lengths must match to compare") + if _can_broadcast(lvalues, rvalues): + return _broadcast_comparison_op(lvalues, rvalues, op) + raise ValueError( + "Lengths must match to compare", lvalues.shape, rvalues.shape + ) if should_extension_dispatch(lvalues, rvalues): - res_values = dispatch_to_extension_op(op, lvalues, rvalues) + # Call the method on lvalues + res_values = op(lvalues, rvalues) elif is_scalar(rvalues) and isna(rvalues): # numpy does not like comparisons vs None @@ -295,12 +345,12 @@ def na_logical_op(x: np.ndarray, y, op): AttributeError, OverflowError, NotImplementedError, - ): + ) as err: typ = type(y).__name__ raise TypeError( f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array " f"and scalar of type [{typ}]" - ) + ) from err return result.reshape(x.shape) @@ -344,11 +394,12 @@ def fill_bool(x, left=None): right = construct_1d_object_array_from_listlike(right) # NB: We assume extract_array has already been called on left and right - lvalues = left + lvalues = maybe_upcast_datetimelike_array(left) rvalues = right if should_extension_dispatch(lvalues, rvalues): - res_values = dispatch_to_extension_op(op, lvalues, rvalues) + # Call the method on lvalues + res_values = op(lvalues, rvalues) else: if isinstance(rvalues, np.ndarray): @@ -391,3 +442,87 @@ def get_array_op(op, str_rep: Optional[str] = None): return partial(logical_op, op=op) else: return partial(arithmetic_op, op=op, str_rep=str_rep) + + +def maybe_upcast_datetimelike_array(obj: ArrayLike) -> ArrayLike: + """ + If we have an ndarray that is either datetime64 or timedelta64, wrap in EA. + + Parameters + ---------- + obj : ndarray or ExtensionArray + + Returns + ------- + ndarray or ExtensionArray + """ + if isinstance(obj, np.ndarray): + if obj.dtype.kind == "m": + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(obj) + if obj.dtype.kind == "M": + from pandas.core.arrays import DatetimeArray + + return DatetimeArray._from_sequence(obj) + + return obj + + +def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): + """ + Cast non-pandas objects to pandas types to unify behavior of arithmetic + and comparison operations. + + Parameters + ---------- + obj: object + shape : tuple[int] + + Returns + ------- + out : object + + Notes + ----- + Be careful to call this *after* determining the `name` attribute to be + attached to the result of the arithmetic operation. 
+ """ + from pandas.core.arrays import DatetimeArray, TimedeltaArray + + if type(obj) is timedelta: + # GH#22390 cast up to Timedelta to rely on Timedelta + # implementation; otherwise operation against numeric-dtype + # raises TypeError + return Timedelta(obj) + elif isinstance(obj, np.datetime64): + # GH#28080 numpy casts integer-dtype to datetime64 when doing + # array[int] + datetime64, which we do not allow + if isna(obj): + # Avoid possible ambiguities with pd.NaT + obj = obj.astype("datetime64[ns]") + right = np.broadcast_to(obj, shape) + return DatetimeArray(right) + + return Timestamp(obj) + + elif isinstance(obj, np.timedelta64): + if isna(obj): + # wrapping timedelta64("NaT") in Timedelta returns NaT, + # which would incorrectly be treated as a datetime-NaT, so + # we broadcast and wrap in a TimedeltaArray + obj = obj.astype("timedelta64[ns]") + right = np.broadcast_to(obj, shape) + return TimedeltaArray(right) + + # In particular non-nanosecond timedelta64 needs to be cast to + # nanoseconds, or else we get undesired behavior like + # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') + return Timedelta(obj) + + elif isinstance(obj, np.ndarray) and obj.dtype.kind == "m": + # GH#22390 Unfortunately we need to special-case right-hand + # timedelta64 dtypes because numpy casts integer dtypes to + # timedelta64 when operating with timedelta64 + return TimedeltaArray._from_sequence(obj) + return obj diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index 61a3032c7a02c..2463a1f58a447 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -1,48 +1,33 @@ """ Functions for defining unary operations. """ -from typing import Any, Union +from typing import Any -import numpy as np +from pandas._typing import ArrayLike from pandas.core.dtypes.common import ( is_datetime64_dtype, - is_extension_array_dtype, is_integer_dtype, is_object_dtype, - is_scalar, is_timedelta64_dtype, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries +from pandas.core.dtypes.generic import ABCExtensionArray -from pandas.core.construction import array - -def should_extension_dispatch(left: ABCSeries, right: Any) -> bool: +def should_extension_dispatch(left: ArrayLike, right: Any) -> bool: """ - Identify cases where Series operation should use dispatch_to_extension_op. + Identify cases where Series operation should dispatch to ExtensionArray method. Parameters ---------- - left : Series + left : np.ndarray or ExtensionArray right : object Returns ------- bool """ - if ( - is_extension_array_dtype(left.dtype) - or is_datetime64_dtype(left.dtype) - or is_timedelta64_dtype(left.dtype) - ): - return True - - if not is_scalar(right) and is_extension_array_dtype(right): - # GH#22378 disallow scalar to exclude e.g. "category", "Int64" - return True - - return False + return isinstance(left, ABCExtensionArray) or isinstance(right, ABCExtensionArray) def should_series_dispatch(left, right, op): @@ -91,36 +76,3 @@ def should_series_dispatch(left, right, op): return True return False - - -def dispatch_to_extension_op( - op, left: Union[ABCExtensionArray, np.ndarray], right: Any, -): - """ - Assume that left or right is a Series backed by an ExtensionArray, - apply the operator defined by op. 
- - Parameters - ---------- - op : binary operator - left : ExtensionArray or np.ndarray - right : object - - Returns - ------- - ExtensionArray or np.ndarray - 2-tuple of these if op is divmod or rdivmod - """ - # NB: left and right should already be unboxed, so neither should be - # a Series or Index. - - if left.dtype.kind in "mM" and isinstance(left, np.ndarray): - # We need to cast datetime64 and timedelta64 ndarrays to - # DatetimeArray/TimedeltaArray. But we avoid wrapping others in - # PandasArray as that behaves poorly with e.g. IntegerArray. - left = array(left) - - # The op calls will raise TypeError if the op is not defined - # on the ExtensionArray - res_values = op(left, right) - return res_values diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 203ea3946d1b2..449a477646c02 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -29,11 +29,14 @@ def _make_flex_doc(op_name, typ): if typ == "series": base_doc = _flex_doc_SERIES + if op_desc["reverse"]: + base_doc += _see_also_reverse_SERIES.format( + reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"], + ) doc_no_examples = base_doc.format( desc=op_desc["desc"], op_name=op_name, equiv=equiv, - reverse=op_desc["reverse"], series_returns=op_desc["series_returns"], ) if op_desc["series_examples"]: @@ -53,7 +56,7 @@ def _make_flex_doc(op_name, typ): return doc -_add_example_SERIES = """ +_common_examples_algebra_SERIES = """ Examples -------- >>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) @@ -69,33 +72,44 @@ def _make_flex_doc(op_name, typ): b NaN d 1.0 e NaN -dtype: float64 ->>> a.add(b, fill_value=0) -a 2.0 -b 1.0 -c 1.0 -d 1.0 -e NaN -dtype: float64 -""" +dtype: float64""" -_sub_example_SERIES = """ +_common_examples_comparison_SERIES = """ Examples -------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a = pd.Series([1, 1, 1, np.nan, 1], index=['a', 'b', 'c', 'd', 'e']) >>> a a 1.0 b 1.0 c 1.0 d NaN +e 1.0 dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b = pd.Series([0, 1, 2, np.nan, 1], index=['a', 'b', 'c', 'd', 'f']) >>> b -a 1.0 -b NaN +a 0.0 +b 1.0 +c 2.0 +d NaN +f 1.0 +dtype: float64""" + +_add_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.add(b, fill_value=0) +a 2.0 +b 1.0 +c 1.0 d 1.0 e NaN dtype: float64 +""" +) + +_sub_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.subtract(b, fill_value=0) a 0.0 b 1.0 @@ -104,24 +118,11 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ +) -_mul_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +_mul_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.multiply(b, fill_value=0) a 1.0 b 0.0 @@ -130,24 +131,11 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ +) -_div_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +_div_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.divide(b, fill_value=0) a 1.0 b inf @@ -156,24 +144,11 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 
""" +) -_floordiv_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +_floordiv_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.floordiv(b, fill_value=0) a 1.0 b NaN @@ -182,24 +157,11 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ +) -_mod_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +_mod_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.mod(b, fill_value=0) a 0.0 b NaN @@ -208,23 +170,10 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ -_pow_example_SERIES = """ -Examples --------- ->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) ->>> a -a 1.0 -b 1.0 -c 1.0 -d NaN -dtype: float64 ->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) ->>> b -a 1.0 -b NaN -d 1.0 -e NaN -dtype: float64 +) +_pow_example_SERIES = ( + _common_examples_algebra_SERIES + + """ >>> a.pow(b, fill_value=0) a 1.0 b 1.0 @@ -233,6 +182,89 @@ def _make_flex_doc(op_name, typ): e NaN dtype: float64 """ +) + +_ne_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.ne(b, fill_value=0) +a False +b True +c True +d True +e True +dtype: bool +""" +) + +_eq_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.eq(b, fill_value=0) +a True +b False +c False +d False +e False +dtype: bool +""" +) + +_lt_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.lt(b, fill_value=0) +a False +b False +c True +d False +e False +f True +dtype: bool +""" +) + +_le_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.le(b, fill_value=0) +a False +b True +c True +d False +e False +f True +dtype: bool +""" +) + +_gt_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.gt(b, fill_value=0) +a True +b False +c False +d False +e True +f False +dtype: bool +""" +) + +_ge_example_SERIES = ( + _common_examples_comparison_SERIES + + """ +>>> a.ge(b, fill_value=0) +a True +b True +c False +d False +e True +f False +dtype: bool +""" +) _returns_series = """Series\n The result of the operation.""" @@ -306,52 +338,62 @@ def _make_flex_doc(op_name, typ): "op": "==", "desc": "Equal to", "reverse": None, - "series_examples": None, + "series_examples": _eq_example_SERIES, "series_returns": _returns_series, }, "ne": { "op": "!=", "desc": "Not equal to", "reverse": None, - "series_examples": None, + "series_examples": _ne_example_SERIES, "series_returns": _returns_series, }, "lt": { "op": "<", "desc": "Less than", "reverse": None, - "series_examples": None, + "series_examples": _lt_example_SERIES, "series_returns": _returns_series, }, "le": { "op": "<=", "desc": "Less than or equal to", "reverse": None, - "series_examples": None, + "series_examples": _le_example_SERIES, "series_returns": _returns_series, }, "gt": { "op": ">", "desc": "Greater than", "reverse": None, - "series_examples": None, + "series_examples": _gt_example_SERIES, "series_returns": _returns_series, }, "ge": { "op": ">=", "desc": "Greater than or equal to", "reverse": None, - "series_examples": None, + "series_examples": _ge_example_SERIES, "series_returns": 
_returns_series, }, } +_py_num_ref = """see + `Python documentation + `_ + for more details""" _op_names = list(_op_descriptions.keys()) for key in _op_names: reverse_op = _op_descriptions[key]["reverse"] if reverse_op is not None: _op_descriptions[reverse_op] = _op_descriptions[key].copy() _op_descriptions[reverse_op]["reverse"] = key + _op_descriptions[key][ + "see_also_desc" + ] = f"Reverse of the {_op_descriptions[key]['desc']} operator, {_py_num_ref}" + _op_descriptions[reverse_op][ + "see_also_desc" + ] = f"Element-wise {_op_descriptions[key]['desc']}, {_py_num_ref}" _flex_doc_SERIES = """ Return {desc} of series and other, element-wise (binary operator `{op_name}`). @@ -374,10 +416,12 @@ def _make_flex_doc(op_name, typ): Returns ------- {series_returns} +""" +_see_also_reverse_SERIES = """ See Also -------- -Series.{reverse} +Series.{reverse} : {see_also_desc}. """ _arith_doc_FRAME = """ diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index c04658565f235..7c63bc43de67e 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -93,11 +93,12 @@ def _wrap_inplace_method(method): def f(self, other): result = method(self, other) - + # Delete cacher + self._reset_cacher() # this makes sure that we are aligned like the input # we are updating inplace so we want to ignore is_copy self._update_inplace( - result.reindex_like(self, copy=False)._data, verify_is_copy=False + result.reindex_like(self, copy=False), verify_is_copy=False ) return self diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 854d6072eea36..c33cb32dcec19 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -72,7 +72,7 @@ def fill_zeros(result, x, y): def mask_zero_div_zero(x, y, result): """ - Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes + Set results of 0 // 0 to np.nan, regardless of the dtypes of the numerator or the denominator. Parameters @@ -83,13 +83,16 @@ def mask_zero_div_zero(x, y, result): Returns ------- - filled_result : ndarray + ndarray + The filled result. Examples -------- >>> x = np.array([1, 0, -1], dtype=np.int64) + >>> x + array([ 1, 0, -1]) >>> y = 0 # int 0; numpy behavior is different with float - >>> result = x / y + >>> result = x // y >>> result # raw numpy result does not fill division by zero array([0, 0, 0]) >>> mask_zero_div_zero(x, y, result) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f19a82ab6f86a..1e93597d92a5d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -11,13 +11,13 @@ from pandas._libs.tslibs.period import IncompatibleFrequency from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries import pandas.core.algorithms as algos from pandas.core.base import DataError, ShallowMixin -from pandas.core.generic import _shared_docs +from pandas.core.generic import NDFrame, _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby @@ -467,8 +467,6 @@ def nearest(self, limit=None): limit : int, optional Limit of how many values to fill. - .. 
versionadded:: 0.21.0 - Returns ------- Series or DataFrame @@ -775,7 +773,7 @@ def fillna(self, method, limit=None): """ return self._upsample(method, limit=limit) - @Appender(_shared_docs["interpolate"] % _shared_docs_kwargs) + @doc(NDFrame.interpolate, **_shared_docs_kwargs) def interpolate( self, method="linear", @@ -858,7 +856,7 @@ def var(self, ddof=1, *args, **kwargs): nv.validate_resampler_func("var", args, kwargs) return self._downsample("var", ddof=ddof) - @Appender(GroupBy.size.__doc__) + @doc(GroupBy.size) def size(self): result = self._downsample("size") if not len(self.ax): @@ -871,7 +869,7 @@ def size(self): result = Series([], index=result.index, dtype="int64", name=name) return result - @Appender(GroupBy.count.__doc__) + @doc(GroupBy.count) def count(self): result = self._downsample("count") if not len(self.ax): @@ -1422,13 +1420,15 @@ def _get_time_bins(self, ax): # because replace() will swallow the nanosecond part # thus last bin maybe slightly before the end if the end contains # nanosecond part and lead to `Values falls after last bin` error + # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback + # has noted that ambiguous=True provides the most sensible result binner = labels = date_range( freq=self.freq, start=first, end=last, tz=ax.tz, name=ax.name, - ambiguous="infer", + ambiguous=True, nonexistent="shift_forward", ) @@ -1596,13 +1596,13 @@ def _get_period_bins(self, ax): def _take_new_index(obj, indexer, new_index, axis=0): if isinstance(obj, ABCSeries): - new_values = algos.take_1d(obj.values, indexer) + new_values = algos.take_1d(obj._values, indexer) return obj._constructor(new_values, index=new_index, name=obj.name) elif isinstance(obj, ABCDataFrame): if axis == 1: raise NotImplementedError("axis 1 is not supported") return obj._constructor( - obj._data.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) + obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) ) else: raise ValueError("'obj' should be either a Series or a DataFrame") diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d9f21f0b274ac..a868e663b06a5 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -2,12 +2,14 @@ Concat routines. 
""" -from typing import Hashable, Iterable, List, Mapping, Optional, Union, overload +from collections import abc +from typing import Iterable, List, Mapping, Union, overload import numpy as np -from pandas._typing import FrameOrSeriesUnion +from pandas._typing import FrameOrSeriesUnion, Label +from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas import DataFrame, Index, MultiIndex, Series @@ -32,7 +34,7 @@ @overload def concat( - objs: Union[Iterable["DataFrame"], Mapping[Optional[Hashable], "DataFrame"]], + objs: Union[Iterable["DataFrame"], Mapping[Label, "DataFrame"]], axis=0, join: str = "outer", ignore_index: bool = False, @@ -48,9 +50,7 @@ def concat( @overload def concat( - objs: Union[ - Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] - ], + objs: Union[Iterable[FrameOrSeriesUnion], Mapping[Label, FrameOrSeriesUnion]], axis=0, join: str = "outer", ignore_index: bool = False, @@ -65,9 +65,7 @@ def concat( def concat( - objs: Union[ - Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] - ], + objs: Union[Iterable[FrameOrSeriesUnion], Mapping[Label, FrameOrSeriesUnion]], axis=0, join="outer", ignore_index: bool = False, @@ -89,7 +87,7 @@ def concat( Parameters ---------- objs : a sequence or mapping of Series or DataFrame objects - If a dict is passed, the sorted keys will be used as the `keys` + If a mapping is passed, the sorted keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. @@ -319,7 +317,7 @@ def __init__( "Only can inner (intersect) or outer (union) join the other axis" ) - if isinstance(objs, dict): + if isinstance(objs, abc.Mapping): if keys is None: keys = list(objs.keys()) objs = [objs[k] for k in keys] @@ -398,7 +396,7 @@ def __init__( # Need to flip BlockManager axis in the DataFrame special case self._is_frame = isinstance(sample, ABCDataFrame) if self._is_frame: - axis = 1 if axis == 0 else 0 + axis = DataFrame._get_block_manager_axis(axis) self._is_series = isinstance(sample, ABCSeries) if not 0 <= axis <= sample.ndim: @@ -439,7 +437,8 @@ def __init__( self.objs.append(obj) # note: this is the BlockManager axis (since DataFrame is transposed) - self.axis = axis + self.bm_axis = axis + self.axis = 1 - self.bm_axis if self._is_frame else 0 self.keys = keys self.names = names or getattr(keys, "names", None) self.levels = levels @@ -457,14 +456,15 @@ def get_result(self): if self._is_series: # stack blocks - if self.axis == 0: + if self.bm_axis == 0: name = com.consensus_name_attr(self.objs) - - mgr = self.objs[0]._data.concat( - [x._data for x in self.objs], self.new_axes - ) cons = self.objs[0]._constructor - return cons(mgr, name=name).__finalize__(self, method="concat") + + arrs = [ser._values for ser in self.objs] + + res = concat_compat(arrs, axis=0) + result = cons(res, index=self.new_axes[0], name=name, dtype=res.dtype) + return result.__finalize__(self, method="concat") # combine as columns in a frame else: @@ -480,21 +480,22 @@ def get_result(self): else: mgrs_indexers = [] for obj in self.objs: - mgr = obj._data indexers = {} for ax, new_labels in enumerate(self.new_axes): - if ax == self.axis: + # ::-1 to convert BlockManager ax to DataFrame ax + if ax == self.bm_axis: # Suppress reindexing on concat axis continue - obj_labels = mgr.axes[ax] + # 1-ax to 
convert BlockManager axis to DataFrame axis + obj_labels = obj.axes[1 - ax] if not new_labels.equals(obj_labels): indexers[ax] = obj_labels.reindex(new_labels)[1] - mgrs_indexers.append((obj._data, indexers)) + mgrs_indexers.append((obj._mgr, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy + mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy ) if not self.copy: new_data._consolidate_inplace() @@ -503,7 +504,7 @@ def get_result(self): return cons(new_data).__finalize__(self, method="concat") def _get_result_dim(self) -> int: - if self._is_series and self.axis == 1: + if self._is_series and self.bm_axis == 1: return 2 else: return self.objs[0].ndim @@ -511,7 +512,7 @@ def _get_result_dim(self) -> int: def _get_new_axes(self) -> List[Index]: ndim = self._get_result_dim() return [ - self._get_concat_axis() if i == self.axis else self._get_comb_axis(i) + self._get_concat_axis() if i == self.bm_axis else self._get_comb_axis(i) for i in range(ndim) ] @@ -530,13 +531,13 @@ def _get_concat_axis(self) -> Index: Return index to be used along concatenation axis. """ if self._is_series: - if self.axis == 0: + if self.bm_axis == 0: indexes = [x.index for x in self.objs] elif self.ignore_index: idx = ibase.default_index(len(self.objs)) return idx elif self.keys is None: - names: List[Optional[Hashable]] = [None] * len(self.objs) + names: List[Label] = [None] * len(self.objs) num = 0 has_names = False for i, x in enumerate(self.objs): @@ -558,7 +559,7 @@ def _get_concat_axis(self) -> Index: else: return ensure_index(self.keys).set_names(self.names) else: - indexes = [x._data.axes[self.axis] for x in self.objs] + indexes = [x.axes[self.axis] for x in self.objs] if self.ignore_index: idx = ibase.default_index(sum(len(i) for i in indexes)) @@ -620,8 +621,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde for key, index in zip(hlevel, indexes): try: i = level.get_loc(key) - except KeyError: - raise ValueError(f"Key {key} not in level {level}") + except KeyError as err: + raise ValueError(f"Key {key} not in level {level}") from err to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 782b8043430e1..c3e170b0e39c4 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -105,12 +105,12 @@ def melt( if is_extension_array_dtype(id_data): id_data = concat([id_data] * K, ignore_index=True) else: - id_data = np.tile(id_data.values, K) + id_data = np.tile(id_data._values, K) mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] - mdata[value_name] = frame.values.ravel("F") + mdata[value_name] = frame._values.ravel("F") for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N) @@ -170,13 +170,13 @@ def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFr pivot_cols = [] for target, names in zip(keys, values): - to_concat = [data[col].values for col in names] + to_concat = [data[col]._values for col in names] mdata[target] = concat_compat(to_concat) pivot_cols.append(target) for col in id_cols: - mdata[col] = np.tile(data[col].values, K) + mdata[col] = np.tile(data[col]._values, K) if dropna: mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 
49ac1b6cfa52b..607a1b75dcfcd 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -13,7 +13,7 @@ from pandas._libs import Timedelta, hashtable as libhashtable, lib import pandas._libs.join as libjoin -from pandas._typing import FrameOrSeries +from pandas._typing import ArrayLike, FrameOrSeries from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution @@ -43,10 +43,11 @@ from pandas import Categorical, Index, MultiIndex from pandas.core import groupby import pandas.core.algorithms as algos -from pandas.core.arrays.categorical import _recode_for_categories +from pandas.core.arrays.categorical import recode_for_categories import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc -from pandas.core.internals import _transform_index, concatenate_block_managers +from pandas.core.internals import concatenate_block_managers from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: @@ -92,9 +93,7 @@ def merge( merge.__doc__ = _merge_doc % "\nleft : DataFrame" -def _groupby_and_merge( - by, on, left, right: "DataFrame", _merge_pieces, check_duplicates: bool = True -): +def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_pieces): """ groupby & merge; we are always performing a left-by type operation @@ -102,11 +101,9 @@ def _groupby_and_merge( ---------- by: field to group on: duplicates field - left: left frame - right: right frame - _merge_pieces: function for merging - check_duplicates: bool, default True - should we check & clean duplicates + left: DataFrame + right: DataFrame + merge_pieces: function for merging """ pieces = [] if not isinstance(by, (list, tuple)): @@ -118,18 +115,6 @@ def _groupby_and_merge( # if we can groupby the rhs # then we can get vastly better perf - # we will check & remove duplicates if indicated - if check_duplicates: - if on is None: - on = [] - elif not isinstance(on, (list, tuple)): - on = [on] - - if right.duplicated(by + on).any(): - _right = right.drop_duplicates(by + on, keep="last") - # TODO: use overload to refine return type of drop_duplicates - assert _right is not None # needed for mypy - right = _right try: rby = right.groupby(by, sort=False) except KeyError: @@ -151,16 +136,13 @@ def _groupby_and_merge( pieces.append(merged) continue - merged = _merge_pieces(lhs, rhs) + merged = merge_pieces(lhs, rhs) # make sure join keys are in the merged - # TODO, should _merge_pieces do this? + # TODO, should merge_pieces do this? for k in by: - try: - if k in merged: - merged[k] = key - except KeyError: - pass + if k in merged: + merged[k] = key pieces.append(merged) @@ -235,12 +217,19 @@ def merge_ordered( See Also -------- - merge - merge_asof + merge : Merge with a database-style join. + merge_asof : Merge on nearest keys. Examples -------- - >>> A + >>> df1 = pd.DataFrame( + ... { + ... "key": ["a", "c", "e", "a", "c", "e"], + ... "lvalue": [1, 2, 3, 1, 2, 3], + ... "group": ["a", "a", "a", "b", "b", "b"] + ... } + ... 
) + >>> df1 key lvalue group 0 a 1 a 1 c 2 a @@ -249,24 +238,25 @@ def merge_ordered( 4 c 2 b 5 e 3 b - >>> B - Key rvalue - 0 b 1 - 1 c 2 - 2 d 3 - - >>> merge_ordered(A, B, fill_method='ffill', left_by='group') - group key lvalue rvalue - 0 a a 1 NaN - 1 a b 1 1.0 - 2 a c 2 2.0 - 3 a d 2 3.0 - 4 a e 3 3.0 - 5 b a 1 NaN - 6 b b 1 1.0 - 7 b c 2 2.0 - 8 b d 2 3.0 - 9 b e 3 3.0 + >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + >>> df2 + key rvalue + 0 b 1 + 1 c 2 + 2 d 3 + + >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group") + key lvalue group rvalue + 0 a 1 a NaN + 1 b 1 a 1.0 + 2 c 2 a 2.0 + 3 d 2 a 3.0 + 4 e 3 a 3.0 + 5 a 1 b NaN + 6 b 1 b 1.0 + 7 c 2 b 2.0 + 8 d 2 b 3.0 + 9 e 3 b 3.0 """ def _merger(x, y): @@ -287,16 +277,11 @@ def _merger(x, y): raise ValueError("Can only group either left or right frames") elif left_by is not None: result, _ = _groupby_and_merge( - left_by, on, left, right, lambda x, y: _merger(x, y), check_duplicates=False + left_by, on, left, right, lambda x, y: _merger(x, y) ) elif right_by is not None: result, _ = _groupby_and_merge( - right_by, - on, - right, - left, - lambda x, y: _merger(y, x), - check_duplicates=False, + right_by, on, right, left, lambda x, y: _merger(y, x) ) else: result = _merger(left, right) @@ -320,10 +305,10 @@ def merge_asof( direction: str = "backward", ) -> "DataFrame": """ - Perform an asof merge. This is similar to a left-join except that we - match on nearest key rather than equal keys. + Perform an asof merge. - Both DataFrames must be sorted by the key. + This is similar to a left-join except that we match on nearest + key rather than equal keys. Both DataFrames must be sorted by the key. For each row in the left DataFrame: @@ -387,20 +372,19 @@ def merge_asof( See Also -------- - merge - merge_ordered + merge : Merge with a database-style join. + merge_ordered : Merge with optional filling/interpolation. Examples -------- - >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']}) + >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) >>> left a left_val 0 1 a 1 5 b 2 10 c - >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - ... 'right_val': [1, 2, 3, 6, 7]}) + >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) >>> right a right_val 0 1 1 @@ -409,25 +393,25 @@ def merge_asof( 3 6 6 4 7 7 - >>> pd.merge_asof(left, right, on='a') + >>> pd.merge_asof(left, right, on="a") a left_val right_val 0 1 a 1 1 5 b 3 2 10 c 7 - >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False) + >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False) a left_val right_val 0 1 a NaN 1 5 b 3.0 2 10 c 7.0 - >>> pd.merge_asof(left, right, on='a', direction='forward') + >>> pd.merge_asof(left, right, on="a", direction="forward") a left_val right_val 0 1 a 1.0 1 5 b 6.0 2 10 c NaN - >>> pd.merge_asof(left, right, on='a', direction='nearest') + >>> pd.merge_asof(left, right, on="a", direction="nearest") a left_val right_val 0 1 a 1 1 5 b 6 @@ -435,15 +419,14 @@ def merge_asof( We can use indexed DataFrames as well. - >>> left = pd.DataFrame({'left_val': ['a', 'b', 'c']}, index=[1, 5, 10]) + >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10]) >>> left left_val 1 a 5 b 10 c - >>> right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7]}, - ... 
index=[1, 2, 3, 6, 7]) + >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7]) >>> right right_val 1 1 @@ -460,6 +443,32 @@ def merge_asof( Here is a real-world times-series example + >>> quotes = pd.DataFrame( + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.030"), + ... pd.Timestamp("2016-05-25 13:30:00.041"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.049"), + ... pd.Timestamp("2016-05-25 13:30:00.072"), + ... pd.Timestamp("2016-05-25 13:30:00.075") + ... ], + ... "ticker": [ + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT" + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] + ... } + ... ) >>> quotes time ticker bid ask 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93 @@ -471,6 +480,20 @@ def merge_asof( 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 + >>> trades = pd.DataFrame( + ... { + ... "time": [ + ... pd.Timestamp("2016-05-25 13:30:00.023"), + ... pd.Timestamp("2016-05-25 13:30:00.038"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048"), + ... pd.Timestamp("2016-05-25 13:30:00.048") + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100] + ... } + ... ) >>> trades time ticker price quantity 0 2016-05-25 13:30:00.023 MSFT 51.95 75 @@ -481,9 +504,7 @@ def merge_asof( By default we are taking the asof of the quotes - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker') + >>> pd.merge_asof(trades, quotes, on="time", by="ticker") time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 @@ -493,10 +514,9 @@ def merge_asof( We only asof within 2ms between the quote time and the trade time - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker', - ... tolerance=pd.Timedelta('2ms')) + >>> pd.merge_asof( + ... trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms") + ... ) time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN @@ -508,11 +528,14 @@ def merge_asof( and we exclude exact matches on time. However *prior* data will propagate forward - >>> pd.merge_asof(trades, quotes, - ... on='time', - ... by='ticker', - ... tolerance=pd.Timedelta('10ms'), - ... allow_exact_matches=False) + >>> pd.merge_asof( + ... trades, + ... quotes, + ... on="time", + ... by="ticker", + ... tolerance=pd.Timedelta("10ms"), + ... allow_exact_matches=False + ... 
) time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 @@ -572,7 +595,11 @@ def __init__( self.left = self.orig_left = _left self.right = self.orig_right = _right self.how = how - self.axis = axis + + # bm_axis -> the axis on the BlockManager + self.bm_axis = axis + # axis --> the axis on the Series/DataFrame + self.axis = 1 - axis if self.left.ndim == 2 else 0 self.on = com.maybe_make_list(on) self.left_on = com.maybe_make_list(left_on) @@ -640,18 +667,17 @@ def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes llabels, rlabels = _items_overlap_with_suffix( - ldata.items, lsuf, rdata.items, rsuf + self.left._info_axis, lsuf, self.right._info_axis, rsuf ) lindexers = {1: left_indexer} if left_indexer is not None else {} rindexers = {1: right_indexer} if right_indexer is not None else {} result_data = concatenate_block_managers( - [(ldata, lindexers), (rdata, rindexers)], + [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], axes=[llabels.append(rlabels), join_index], concat_axis=0, copy=self.copy, @@ -840,8 +866,8 @@ def _get_join_indexers(self): ) def _get_join_info(self): - left_ax = self.left._data.axes[self.axis] - right_ax = self.right._data.axes[self.axis] + left_ax = self.left.axes[self.axis] + right_ax = self.right.axes[self.axis] if self.left_index and self.right_index and self.how != "asof": join_index, left_indexer, right_indexer = left_ax.join( @@ -1294,7 +1320,7 @@ def _get_join_indexers( # get left & right join labels and num. of levels at each location mapped = ( - _factorize_keys(left_keys[n], right_keys[n], sort=sort) + _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) for n in range(len(left_keys)) ) zipped = zip(*mapped) @@ -1306,13 +1332,18 @@ def _get_join_indexers( # factorize keys to a dense i8 space # `count` is the num. 
of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) if how == "left": kwargs["sort"] = sort - join_func = _join_functions[how] + join_func = { + "inner": libjoin.inner_join, + "left": libjoin.left_outer_join, + "right": _right_outer_join, + "outer": libjoin.full_outer_join, + }[how] return join_func(lkey, rkey, count, **kwargs) @@ -1365,7 +1396,7 @@ def _convert_to_mulitindex(index) -> MultiIndex: if isinstance(index, MultiIndex): return index else: - return MultiIndex.from_arrays([index.values], names=[index.name]) + return MultiIndex.from_arrays([index._values], names=[index.name]) # For multi-multi joins with one overlapping level, # the returned index if of type Index @@ -1449,12 +1480,10 @@ def __init__( def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - # this is a bit kludgy - ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes llabels, rlabels = _items_overlap_with_suffix( - ldata.items, lsuf, rdata.items, rsuf + self.left._info_axis, lsuf, self.right._info_axis, rsuf ) if self.fill_method == "ffill": @@ -1468,7 +1497,7 @@ def get_result(self): rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} result_data = concatenate_block_managers( - [(ldata, lindexers), (rdata, rindexers)], + [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], axes=[llabels.append(rlabels), join_index], concat_axis=0, copy=self.copy, @@ -1600,11 +1629,6 @@ def _validate_specification(self): if self.direction not in ["backward", "forward", "nearest"]: raise MergeError(f"direction invalid: {self.direction}") - @property - def _asof_key(self): - """ This is our asof key, the 'on' """ - return self.left_on[-1] - def _get_merge_keys(self): # note this function has side effects @@ -1680,10 +1704,13 @@ def _get_merge_keys(self): def _get_join_indexers(self): """ return the join indexers """ - def flip(xs): + def flip(xs) -> np.ndarray: """ unlike np.transpose, this returns an array of tuples """ xs = [ - x if not is_extension_array_dtype(x) else x._ndarray_values for x in xs + x + if not is_extension_array_dtype(x) + else extract_array(x)._values_for_argsort() + for x in xs ] labels = list(string.ascii_lowercase[: len(xs)]) dtypes = [x.dtype for x in xs] @@ -1692,10 +1719,10 @@ def flip(xs): # values to compare left_values = ( - self.left.index.values if self.left_index else self.left_join_keys[-1] + self.left.index._values if self.left_index else self.left_join_keys[-1] ) right_values = ( - self.right.index.values if self.right_index else self.right_join_keys[-1] + self.right.index._values if self.right_index else self.right_join_keys[-1] ) tolerance = self.tolerance @@ -1842,38 +1869,85 @@ def _right_outer_join(x, y, max_groups): return left_indexer, right_indexer -_join_functions = { - "inner": libjoin.inner_join, - "left": libjoin.left_outer_join, - "right": _right_outer_join, - "outer": libjoin.full_outer_join, -} +def _factorize_keys( + lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" +) -> Tuple[np.array, np.array, int]: + """ + Encode left and right keys as enumerated types. + + This is used to get the join indexers to be used when merging DataFrames. + Parameters + ---------- + lk : array-like + Left key. + rk : array-like + Right key. 
+ sort : bool, defaults to True + If True, the encoding is done such that the unique elements in the + keys are sorted. + how : {‘left’, ‘right’, ‘outer’, ‘inner’}, default ‘inner’ + Type of merge. -def _factorize_keys(lk, rk, sort=True): + Returns + ------- + array + Left (resp. right if called with `key='right'`) labels, as enumerated type. + array + Right (resp. left if called with `key='right'`) labels, as enumerated type. + int + Number of unique elements in union of left and right labels. + + See Also + -------- + merge : Merge DataFrame or named Series objects + with a database-style join. + algorithms.factorize : Encode the object as an enumerated type + or categorical variable. + + Examples + -------- + >>> lk = np.array(["a", "c", "b"]) + >>> rk = np.array(["a", "c"]) + + Here, the unique values are `'a', 'b', 'c'`. With the default + `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`: + + >>> pd.core.reshape.merge._factorize_keys(lk, rk) + (array([0, 2, 1]), array([0, 2]), 3) + + With the `sort=False`, the encoding will correspond to the order + in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`: + + >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False) + (array([0, 1, 2]), array([0, 1]), 3) + """ # Some pre-processing for non-ndarray lk / rk - if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): - lk = getattr(lk, "_values", lk)._data - rk = getattr(rk, "_values", rk)._data + lk = extract_array(lk, extract_numpy=True) + rk = extract_array(rk, extract_numpy=True) + + if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): + # Extract the ndarray (UTC-localized) values + # Note: we dont need the dtypes to match, as these can still be compared + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() elif ( - is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk) + is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk) ): + assert isinstance(lk, Categorical) + assert isinstance(rk, Categorical) if lk.categories.equals(rk.categories): # if we exactly match in categories, allow us to factorize on codes rk = rk.codes else: # Same categories in different orders -> recode - rk = _recode_for_categories(rk.codes, rk.categories, lk.categories) + rk = recode_for_categories(rk.codes, rk.categories, lk.categories) lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif ( - is_extension_array_dtype(lk.dtype) - and is_extension_array_dtype(rk.dtype) - and lk.dtype == rk.dtype - ): + elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() @@ -1881,15 +1955,15 @@ def _factorize_keys(lk, rk, sort=True): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype klass = libhashtable.Int64Factorizer - lk = ensure_int64(com.values_from_object(lk)) - rk = ensure_int64(com.values_from_object(rk)) - elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass( - rk.dtype.type, (np.timedelta64, np.datetime64) - ): + lk = ensure_int64(np.asarray(lk)) + rk = ensure_int64(np.asarray(rk)) + + elif needs_i8_conversion(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): # GH#23917 TODO: Needs tests for non-matching dtypes klass = libhashtable.Int64Factorizer - lk = ensure_int64(com.values_from_object(lk)) - rk = ensure_int64(com.values_from_object(rk)) + lk = ensure_int64(np.asarray(lk, dtype=np.int64)) + rk = ensure_int64(np.asarray(rk, 
dtype=np.int64)) + else: klass = libhashtable.Factorizer lk = ensure_object(lk) @@ -1919,13 +1993,12 @@ def _factorize_keys(lk, rk, sort=True): np.putmask(rlab, rmask, count) count += 1 + if how == "right": + return rlab, llab, count return llab, rlab, count def _sort_labels(uniques: np.ndarray, left, right): - if not isinstance(uniques, np.ndarray): - # tuplesafe - uniques = Index(uniques).values llength = len(left) labels = np.concatenate([left, right]) @@ -2028,4 +2101,4 @@ def renamer(x, suffix): lrenamer = partial(renamer, suffix=lsuffix) rrenamer = partial(renamer, suffix=rsuffix) - return (_transform_index(left, lrenamer), _transform_index(right, rrenamer)) + return (left._transform_index(lrenamer), right._transform_index(rrenamer)) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b04e4e1ac4d48..17473ac26dfd6 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -150,7 +150,9 @@ def pivot_table( table = table.sort_index(axis=1) if fill_value is not None: - table = table._ensure_type(table.fillna(fill_value, downcast="infer")) + _table = table.fillna(fill_value, downcast="infer") + assert _table is not None # needed for mypy + table = _table if margins: if dropna: @@ -454,10 +456,10 @@ def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFram if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name indexed = data._constructor( - data[values].values, index=index, columns=values + data[values]._values, index=index, columns=values ) else: - indexed = data._constructor_sliced(data[values].values, index=index) + indexed = data._constructor_sliced(data[values]._values, index=index) return indexed.unstack(columns) @@ -498,9 +500,6 @@ def crosstab( margins_name : str, default 'All' Name of the row/column that will contain the totals when margins is True. - - .. versionadded:: 0.21.0 - dropna : bool, default True Do not include columns whose entries are all NaN. 
normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False @@ -631,8 +630,8 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): axis_subs = {0: "index", 1: "columns"} try: normalize = axis_subs[normalize] - except KeyError: - raise ValueError("Not a valid normalize argument") + except KeyError as err: + raise ValueError("Not a valid normalize argument") from err if margins is False: @@ -647,8 +646,8 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): try: f = normalizers[normalize] - except KeyError: - raise ValueError("Not a valid normalize argument") + except KeyError as err: + raise ValueError("Not a valid normalize argument") from err table = f(table) table = table.fillna(0) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 359e5b956f8a5..882e3e0a649cc 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,4 +1,3 @@ -from functools import partial import itertools from typing import List, Optional, Union @@ -7,6 +6,7 @@ import pandas._libs.algos as libalgos import pandas._libs.reshape as libreshape from pandas._libs.sparse import IntIndex +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( @@ -24,7 +24,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable -from pandas.core.construction import extract_array from pandas.core.frame import DataFrame from pandas.core.indexes.api import Index, MultiIndex from pandas.core.series import Series @@ -42,14 +41,10 @@ class _Unstacker: Parameters ---------- - values : ndarray - Values of DataFrame to "Unstack" index : object Pandas ``Index`` level : int or str, default last level Level to "unstack". Accepts a name for the level. - value_columns : Index, optional - Pandas ``Index`` or ``MultiIndex`` object if unstacking a DataFrame fill_value : scalar, optional Default value to fill in missing values if subgroups do not have the same set of labels. By default, missing values will be replaced with @@ -88,28 +83,13 @@ class _Unstacker: """ def __init__( - self, - values: np.ndarray, - index, - level=-1, - value_columns=None, - fill_value=None, - constructor=None, + self, index, level=-1, constructor=None, ): - if values.ndim == 1: - values = values[:, np.newaxis] - self.values = values - self.value_columns = value_columns - self.fill_value = fill_value - if constructor is None: constructor = DataFrame self.constructor = constructor - if value_columns is None and values.shape[1] != 1: # pragma: no cover - raise ValueError("must pass column labels for multi-column data") - self.index = index.remove_unused_levels() self.level = self.index._get_level_number(level) @@ -117,6 +97,7 @@ def __init__( # when index includes `nan`, need to lift levels/strides by 1 self.lift = 1 if -1 in self.index.codes[self.level] else 0 + # Note: the "pop" below alters these in-place. 
self.new_index_levels = list(self.index.levels) self.new_index_names = list(self.index.names) @@ -137,10 +118,10 @@ def __init__( if num_rows > 0 and num_columns > 0 and num_cells <= 0: raise ValueError("Unstacked DataFrame is too big, causing int32 overflow") - self._make_sorted_values_labels() self._make_selectors() - def _make_sorted_values_labels(self): + @cache_readonly + def _indexer_and_to_sort(self): v = self.level codes = list(self.index.codes) @@ -154,8 +135,18 @@ def _make_sorted_values_labels(self): indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0] indexer = ensure_platform_int(indexer) - self.sorted_values = algos.take_nd(self.values, indexer, axis=0) - self.sorted_labels = [l.take(indexer) for l in to_sort] + return indexer, to_sort + + @cache_readonly + def sorted_labels(self): + indexer, to_sort = self._indexer_and_to_sort + return [l.take(indexer) for l in to_sort] + + def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: + indexer, _ = self._indexer_and_to_sort + + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values def _make_selectors(self): new_levels = self.new_index_levels @@ -183,15 +174,26 @@ def _make_selectors(self): self.unique_groups = obs_ids self.compressor = comp_index.searchsorted(np.arange(ngroups)) - def get_result(self): - values, _ = self.get_new_values() - columns = self.get_new_columns() - index = self.get_new_index() + def get_result(self, values, value_columns, fill_value): + + if values.ndim == 1: + values = values[:, np.newaxis] + + if value_columns is None and values.shape[1] != 1: # pragma: no cover + raise ValueError("must pass column labels for multi-column data") + + values, _ = self.get_new_values(values, fill_value) + columns = self.get_new_columns(value_columns) + index = self.new_index return self.constructor(values, index=index, columns=columns) - def get_new_values(self): - values = self.values + def get_new_values(self, values, fill_value=None): + + if values.ndim == 1: + values = values[:, np.newaxis] + + sorted_values = self._make_sorted_values(values) # place the values length, width = self.full_shape @@ -203,8 +205,11 @@ def get_new_values(self): # we can simply reshape if we don't have a mask if mask_all and len(values): + # TODO: Under what circumstances can we rely on sorted_values + # matching values? 
When that holds, we can slice instead + # of take (in particular for EAs) new_values = ( - self.sorted_values.reshape(length, width, stride) + sorted_values.reshape(length, width, stride) .swapaxes(1, 2) .reshape(result_shape) ) @@ -216,14 +221,13 @@ def get_new_values(self): dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: - dtype, fill_value = maybe_promote(values.dtype, self.fill_value) + dtype, fill_value = maybe_promote(values.dtype, fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) name = np.dtype(dtype).name - sorted_values = self.sorted_values # we need to convert to a basic dtype # and possibly coerce an input to our output dtype @@ -254,8 +258,8 @@ def get_new_values(self): return new_values, new_mask - def get_new_columns(self): - if self.value_columns is None: + def get_new_columns(self, value_columns): + if value_columns is None: if self.lift == 0: return self.removed_level._shallow_copy(name=self.removed_name) @@ -263,16 +267,16 @@ def get_new_columns(self): return lev.rename(self.removed_name) stride = len(self.removed_level) + self.lift - width = len(self.value_columns) + width = len(value_columns) propagator = np.repeat(np.arange(width), stride) - if isinstance(self.value_columns, MultiIndex): - new_levels = self.value_columns.levels + (self.removed_level_full,) - new_names = self.value_columns.names + (self.removed_name,) + if isinstance(value_columns, MultiIndex): + new_levels = value_columns.levels + (self.removed_level_full,) + new_names = value_columns.names + (self.removed_name,) - new_codes = [lab.take(propagator) for lab in self.value_columns.codes] + new_codes = [lab.take(propagator) for lab in value_columns.codes] else: - new_levels = [self.value_columns, self.removed_level_full] - new_names = [self.value_columns.name, self.removed_name] + new_levels = [value_columns, self.removed_level_full] + new_names = [value_columns.name, self.removed_name] new_codes = [propagator] # The two indices differ only if the unstacked level had unused items: @@ -291,7 +295,9 @@ def get_new_columns(self): levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) - def get_new_index(self): + @cache_readonly + def new_index(self): + # Does not depend on values or value_columns result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] # construct the new index @@ -338,7 +344,7 @@ def _unstack_multiple(data, clocs, fill_value=None): comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) - if rlocs == []: + if not rlocs: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name="__placeholder__") else: @@ -363,7 +369,7 @@ def _unstack_multiple(data, clocs, fill_value=None): for i in range(len(clocs)): val = clocs[i] result = result.unstack(val, fill_value=fill_value) - clocs = [v if i > v else v - 1 for v in clocs] + clocs = [v if v < val else v - 1 for v in clocs] return result @@ -409,7 +415,7 @@ def unstack(obj, level, fill_value=None): level = obj.index._get_level_number(level) if isinstance(obj, DataFrame): - if isinstance(obj.index, MultiIndex): + if isinstance(obj.index, MultiIndex) or not obj._can_fast_transpose: return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) @@ -417,31 +423,22 @@ def unstack(obj, level, fill_value=None): if 
is_extension_array_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker( - obj.values, - obj.index, - level=level, - fill_value=fill_value, - constructor=obj._constructor_expanddim, + obj.index, level=level, constructor=obj._constructor_expanddim, + ) + return unstacker.get_result( + obj.values, value_columns=None, fill_value=fill_value ) - return unstacker.get_result() def _unstack_frame(obj, level, fill_value=None): - if obj._is_mixed_type: - unstacker = partial( - _Unstacker, index=obj.index, level=level, fill_value=fill_value - ) - blocks = obj._data.unstack(unstacker, fill_value=fill_value) - return obj._constructor(blocks) + if not obj._can_fast_transpose: + unstacker = _Unstacker(obj.index, level=level) + mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) + return obj._constructor(mgr) else: return _Unstacker( - obj.values, - obj.index, - level=level, - value_columns=obj.columns, - fill_value=fill_value, - constructor=obj._constructor, - ).get_result() + obj.index, level=level, constructor=obj._constructor, + ).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value) def _unstack_extension_series(series, level, fill_value): @@ -467,31 +464,10 @@ def _unstack_extension_series(series, level, fill_value): Each column of the DataFrame will have the same dtype as the input Series. """ - # Implementation note: the basic idea is to - # 1. Do a regular unstack on a dummy array of integers - # 2. Followup with a columnwise take. - # We use the dummy take to discover newly-created missing values - # introduced by the reshape. - from pandas.core.reshape.concat import concat - - dummy_arr = np.arange(len(series)) - # fill_value=-1, since we will do a series.values.take later - result = _Unstacker( - dummy_arr, series.index, level=level, fill_value=-1 - ).get_result() - - out = [] - values = extract_array(series, extract_numpy=False) - - for col, indices in result.items(): - out.append( - Series( - values.take(indices.values, allow_fill=True, fill_value=fill_value), - name=col, - index=result.index, - ) - ) - return concat(out, axis="columns", copy=False, keys=result.columns) + # Defer to the logic in ExtensionBlock._unstack + df = series.to_frame() + result = df.unstack(level=level, fill_value=fill_value) + return result.droplevel(level=0, axis=1) def stack(frame, level=-1, dropna=True): @@ -541,9 +517,9 @@ def factorize(index): ) if frame._is_homogeneous_type: - # For homogeneous EAs, frame.values will coerce to object. So + # For homogeneous EAs, frame._values will coerce to object. So # we concatenate instead. 
- dtypes = list(frame.dtypes.values) + dtypes = list(frame.dtypes._values) dtype = dtypes[0] if is_extension_array_dtype(dtype): @@ -554,11 +530,11 @@ def factorize(index): new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA - new_values = frame.values.ravel() + new_values = frame._values.ravel() else: # non-homogeneous - new_values = frame.values.ravel() + new_values = frame._values.ravel() if dropna: mask = notna(new_values) @@ -985,12 +961,7 @@ def get_empty_frame(data) -> DataFrame: if prefix is None: dummy_cols = levels else: - - # PY2 embedded unicode, gh-22084 - def _make_col_name(prefix, prefix_sep, level) -> str: - return f"{prefix}{prefix_sep}{level}" - - dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels] + dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels] index: Optional[Index] if isinstance(data, Series): diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 86417faf6cd11..66c2f5c9b927f 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -7,7 +7,7 @@ from pandas._libs.lib import infer_dtype from pandas.core.dtypes.common import ( - _NS_DTYPE, + DT64NS_DTYPE, ensure_int64, is_bool_dtype, is_categorical_dtype, @@ -171,24 +171,26 @@ def cut( ... index=['a', 'b', 'c', 'd', 'e']) >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False) ... # doctest: +ELLIPSIS - (a 0.0 - b 1.0 - c 2.0 - d 3.0 - e 4.0 - dtype: float64, array([0, 2, 4, 6, 8])) + (a 1.0 + b 2.0 + c 3.0 + d 4.0 + e NaN + dtype: float64, + array([ 0, 2, 4, 6, 8, 10])) Use `drop` optional when bins is not unique >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, ... right=False, duplicates='drop') ... # doctest: +ELLIPSIS - (a 0.0 - b 1.0 - c 2.0 + (a 1.0 + b 2.0 + c 3.0 d 3.0 - e 3.0 - dtype: float64, array([0, 2, 4, 6, 8])) + e NaN + dtype: float64, + array([ 0, 2, 4, 6, 10])) Passing an IntervalIndex for `bins` results in those categories exactly. Notice that values not covered by the IntervalIndex are set to NaN. 0 @@ -197,7 +199,7 @@ def cut( >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) - [NaN, (0, 1], NaN, (2, 3], (4, 5]] + [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 @@ -245,7 +247,7 @@ def cut( else: if is_datetime64tz_dtype(bins): - bins = np.asarray(bins, dtype=_NS_DTYPE) + bins = np.asarray(bins, dtype=DT64NS_DTYPE) else: bins = np.asarray(bins) bins = _convert_bin_to_numeric_type(bins, dtype) @@ -286,7 +288,7 @@ def qcut( Parameters ---------- x : 1d ndarray or Series - q : int or list-like of int + q : int or list-like of float Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. 
labels : array or False, default None diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index d8652c9b4fac9..6949270317f7c 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -2,8 +2,6 @@ from pandas.core.dtypes.common import is_list_like -import pandas.core.common as com - def cartesian_product(X): """ @@ -21,8 +19,7 @@ def cartesian_product(X): Examples -------- >>> cartesian_product([list('ABC'), [1, 2]]) - [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), - array([1, 2, 1, 2, 1, 2])] + [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype=' Type["DataFrame"]: # types @property def _can_hold_na(self): - return self._data._can_hold_na + return self._mgr._can_hold_na _index = None - def _set_axis(self, axis, labels, fastpath: bool = False) -> None: + def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: """ Override generic, we want to set the _typ here. + + This is called from the cython code when we set the `index` attribute + directly, e.g. `series.index = [1, 2, 3]`. """ if not fastpath: labels = ensure_index(labels) @@ -405,7 +418,7 @@ def _set_axis(self, axis, labels, fastpath: bool = False) -> None: labels = DatetimeIndex(labels) # need to set here because we changed the index if fastpath: - self._data.set_axis(axis, labels) + self._mgr.set_axis(axis, labels) except (tslibs.OutOfBoundsDatetime, ValueError): # labels may exceeds datetime bounds, # or not be a DatetimeIndex @@ -413,33 +426,77 @@ def _set_axis(self, axis, labels, fastpath: bool = False) -> None: object.__setattr__(self, "_index", labels) if not fastpath: - self._data.set_axis(axis, labels) - - def _update_inplace(self, result, **kwargs): - # we want to call the generic version and not the IndexOpsMixin - return generic.NDFrame._update_inplace(self, result, **kwargs) + # The ensure_index call above ensures we have an Index object + self._mgr.set_axis(axis, labels) # ndarray compatibility @property - def dtype(self): + def dtype(self) -> DtypeObj: """ Return the dtype object of the underlying data. """ - return self._data.dtype + return self._mgr.dtype @property - def dtypes(self): + def dtypes(self) -> DtypeObj: """ Return the dtype object of the underlying data. """ - return self._data.dtype + # DataFrame compatibility + return self.dtype @property - def name(self) -> Optional[Hashable]: + def name(self) -> Label: + """ + Return the name of the Series. + + The name of a Series becomes its index or column name if it is used + to form a DataFrame. It is also used whenever displaying the Series + using the interpreter. + + Returns + ------- + label (hashable object) + The name of the Series, also the column name if part of a DataFrame. + + See Also + -------- + Series.rename : Sets the Series name when given a scalar input. + Index.name : Corresponding Index property. + + Examples + -------- + The Series name can be set initially when calling the constructor. + + >>> s = pd.Series([1, 2, 3], dtype=np.int64, name='Numbers') + >>> s + 0 1 + 1 2 + 2 3 + Name: Numbers, dtype: int64 + >>> s.name = "Integers" + >>> s + 0 1 + 1 2 + 2 3 + Name: Integers, dtype: int64 + + The name of a Series within a DataFrame is its column name. + + >>> df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], + ... 
columns=["Odd Numbers", "Even Numbers"]) + >>> df + Odd Numbers Even Numbers + 0 1 2 + 1 3 4 + 2 5 6 + >>> df["Even Numbers"].name + 'Even Numbers' + """ return self._name @name.setter - def name(self, value: Optional[Hashable]) -> None: + def name(self, value: Label) -> None: if not is_hashable(value): raise TypeError("Series.name must be a hashable type") object.__setattr__(self, "_name", value) @@ -484,7 +541,7 @@ def values(self): '2013-01-02T05:00:00.000000000', '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]') """ - return self._data.external_values() + return self._mgr.external_values() @property def _values(self): @@ -505,40 +562,25 @@ def _values(self): timedelta64 dtypes), while ``.array`` ensures to always return an ExtensionArray. - Differs from ``._ndarray_values``, as that ensures to always return a - numpy array (it will call ``_ndarray_values`` on the ExtensionArray, if - the Series was backed by an ExtensionArray). - Overview: - dtype | values | _values | array | _ndarray_values | - ----------- | ------------- | ------------- | ------------- | --------------- | - Numeric | ndarray | ndarray | PandasArray | ndarray | - Category | Categorical | Categorical | Categorical | ndarray[int] | - dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | - dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | ndarray[M8ns] | - td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] | ndarray[m8ns] | - Period | ndarray[obj] | PeriodArray | PeriodArray | ndarray[int] | - Nullable | EA | EA | EA | ndarray | + dtype | values | _values | array | + ----------- | ------------- | ------------- | ------------- | + Numeric | ndarray | ndarray | PandasArray | + Category | Categorical | Categorical | Categorical | + dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray | + td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] | + Period | ndarray[obj] | PeriodArray | PeriodArray | + Nullable | EA | EA | EA | """ - return self._data.internal_values() + return self._mgr.internal_values() @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore @property def array(self) -> ExtensionArray: - return self._data._block.array_values() - - def _internal_get_values(self): - """ - Same as values (but handles sparseness conversions); is a view. - - Returns - ------- - numpy.ndarray - Data of the Series. - """ - return self._data.get_values() + return self._mgr._block.array_values() # ops def ravel(self, order="C"): @@ -552,7 +594,7 @@ def ravel(self, order="C"): See Also -------- - numpy.ndarray.ravel + numpy.ndarray.ravel : Return a flattened array. """ return self._values.ravel(order=order) @@ -560,7 +602,7 @@ def __len__(self) -> int: """ Return the length of the Series. 
""" - return len(self._data) + return len(self._mgr) def view(self, dtype=None) -> "Series": """ @@ -630,7 +672,7 @@ def view(self, dtype=None) -> "Series": """ return self._constructor( self._values.view(dtype), index=self.index - ).__finalize__(self) + ).__finalize__(self, method="view") # ---------------------------------------------------------------------- # NDArray Compat @@ -689,11 +731,7 @@ def __array_ufunc__( inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) - name: Optional[Hashable] - if len(set(names)) == 1: - name = names[0] - else: - name = None + name = names[0] if len(set(names)) == 1 else None def construct_return(result): if lib.is_scalar(result): @@ -798,19 +836,10 @@ def take(self, indices, axis=0, is_copy=None, **kwargs) -> "Series": indices = ensure_platform_int(indices) new_index = self.index.take(indices) + new_values = self._values.take(indices) - if is_categorical_dtype(self): - # https://github.com/pandas-dev/pandas/issues/20664 - # TODO: remove when the default Categorical.take behavior changes - indices = maybe_convert_indices(indices, len(self._get_axis(axis))) - kwargs = {"allow_fill": False} - else: - kwargs = {} - new_values = self._values.take(indices, **kwargs) - - return self._constructor( - new_values, index=new_index, fastpath=True - ).__finalize__(self) + result = self._constructor(new_values, index=new_index, fastpath=True) + return result.__finalize__(self, method="take") def _take_with_is_copy(self, indices, axis=0): """ @@ -849,43 +878,45 @@ def __getitem__(self, key): return self key_is_scalar = is_scalar(key) - if key_is_scalar: - key = self.index._convert_scalar_indexer(key, kind="getitem") - elif isinstance(key, (list, tuple)): + if isinstance(key, (list, tuple)): key = unpack_1tuple(key) - if key_is_scalar or isinstance(self.index, MultiIndex): + if is_integer(key) and self.index._should_fallback_to_positional(): + return self._values[key] + + elif key_is_scalar: + return self._get_value(key) + + if ( + isinstance(key, tuple) + and is_hashable(key) + and isinstance(self.index, MultiIndex) + ): # Otherwise index.get_value will raise InvalidIndexError try: - result = self.index.get_value(self, key) + result = self._get_value(key) return result - except InvalidIndexError: - if not isinstance(self.index, MultiIndex): - raise - except (KeyError, ValueError): - if isinstance(key, tuple) and isinstance(self.index, MultiIndex): - # kludge - pass - else: - raise + except KeyError: + # We still have the corner case where this tuple is a key + # in the first level of our MultiIndex + return self._get_values_tuple(key) - if not key_is_scalar: - # avoid expensive checks if we know we have a scalar - if is_iterator(key): - key = list(key) + if is_iterator(key): + key = list(key) - if com.is_bool_indexer(key): - key = check_bool_indexer(self.index, key) - return self._get_values(key) + if com.is_bool_indexer(key): + key = check_bool_indexer(self.index, key) + key = np.asarray(key, dtype=bool) + return self._get_values(key) return self._get_with(key) def _get_with(self, key): # other: fancy integer or otherwise if isinstance(key, slice): - # _convert_slice_indexer to determing if this slice is positional + # _convert_slice_indexer to determin if this slice is positional # or label based, and if the latter, convert to positional slobj = self.index._convert_slice_indexer(key, kind="getitem") return self._slice(slobj) @@ -897,6 +928,10 @@ def _get_with(self, key): elif 
isinstance(key, tuple): return self._get_values_tuple(key) + elif not is_list_like(key): + # e.g. scalars that aren't recognized by lib.is_scalar, GH#32684 + return self.loc[key] + if not isinstance(key, (list, np.ndarray, ExtensionArray, Series, Index)): key = list(key) @@ -940,14 +975,12 @@ def _get_values_tuple(self, key): # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) return self._constructor(self._values[indexer], index=new_index).__finalize__( - self + self, ) def _get_values(self, indexer): try: - return self._constructor( - self._data.get_slice(indexer), fastpath=True - ).__finalize__(self) + return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self,) except ValueError: # mpl compat if we look up e.g. ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack @@ -971,42 +1004,41 @@ def _get_value(self, label, takeable: bool = False): # Similar to Index.get_value, but we do not fall back to positional loc = self.index.get_loc(label) - # We assume that _convert_scalar_indexer has already been called, - # with kind="loc", if necessary, by the time we get here return self.index._get_values_for_loc(self, loc, label) def __setitem__(self, key, value): key = com.apply_if_callable(key, self) cacher_needs_updating = self._check_is_chained_assignment_possible() + if key is Ellipsis: + key = slice(None) + try: self._set_with_engine(key, value) except (KeyError, ValueError): values = self._values if is_integer(key) and not self.index.inferred_type == "integer": + # positional setter values[key] = value - elif key is Ellipsis: - self[:] = value else: + # GH#12862 adding an new key to the Series self.loc[key] = value except TypeError as e: if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): - raise ValueError("Can only tuple-index with a MultiIndex") - - # python 3 type errors should be raised - if _is_unorderable_exception(e): - raise IndexError(key) + raise ValueError("Can only tuple-index with a MultiIndex") from e if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) + key = np.asarray(key, dtype=bool) try: self._where(~key, value, inplace=True) - return except InvalidIndexError: - pass + self._set_values(key.astype(np.bool_), value) + return - self._set_with(key, value) + else: + self._set_with(key, value) if cacher_needs_updating: self._maybe_update_cacher() @@ -1023,20 +1055,8 @@ def _set_with(self, key, value): indexer = self.index._convert_slice_indexer(key, kind="getitem") return self._set_values(indexer, value) - elif is_scalar(key) and not is_integer(key) and key not in self.index: - # GH#12862 adding an new key to the Series - # Note: have to exclude integers because that is ambiguously - # position-based - self.loc[key] = value - return - else: - if isinstance(key, tuple): - try: - # TODO: no test cases that get here - self._set_values(key, value) - except Exception: - pass + assert not isinstance(key, tuple) if is_scalar(key): key = [key] @@ -1047,13 +1067,13 @@ def _set_with(self, key, value): else: key_type = lib.infer_dtype(key, skipna=False) + # Note: key_type == "boolean" should not occur because that + # should be caught by the is_bool_indexer check in __setitem__ if key_type == "integer": if self.index.inferred_type == "integer": self._set_labels(key, value) else: - return self._set_values(key, value) - elif key_type == "boolean": - self._set_values(key.astype(np.bool_), value) + self._set_values(key, value) else: self._set_labels(key, value) @@ -1068,7 
+1088,7 @@ def _set_labels(self, key, value): def _set_values(self, key, value): if isinstance(key, Series): key = key._values - self._data = self._data.setitem(indexer=key, value=value) + self._mgr = self._mgr.setitem(indexer=key, value=value) self._maybe_update_cacher() def _set_value(self, label, value, takeable: bool = False): @@ -1160,7 +1180,9 @@ def repeat(self, repeats, axis=None) -> "Series": nv.validate_repeat(tuple(), dict(axis=axis)) new_index = self.index.repeat(repeats) new_values = self._values.repeat(repeats) - return self._constructor(new_values, index=new_index).__finalize__(self) + return self._constructor(new_values, index=new_index).__finalize__( + self, method="repeat" + ) def reset_index(self, level=None, drop=False, name=None, inplace=False): """ @@ -1287,7 +1309,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): else: return self._constructor( self._values.copy(), index=new_index - ).__finalize__(self) + ).__finalize__(self, method="reset_index") elif inplace: raise TypeError( "Cannot reset_index inplace on a Series to create a DataFrame" @@ -1488,8 +1510,6 @@ def to_dict(self, into=dict): instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. - .. versionadded:: 0.21.0 - Returns ------- collections.abc.Mapping @@ -1662,6 +1682,10 @@ def count(self, level=None): int or Series (if level specified) Number of non-null values in the Series. + See Also + -------- + DataFrame.count : Count non-NA cells for each column or row. + Examples -------- >>> s = pd.Series([0.0, 1.0, np.nan]) @@ -1682,9 +1706,11 @@ def count(self, level=None): level_codes[mask] = cnt = len(lev) lev = lev.insert(cnt, lev._na_value) - obs = level_codes[notna(self.values)] + obs = level_codes[notna(self._values)] out = np.bincount(obs, minlength=len(lev) or None) - return self._constructor(out, index=lev, dtype="int64").__finalize__(self) + return self._constructor(out, index=lev, dtype="int64").__finalize__( + self, method="count" + ) def mode(self, dropna=True) -> "Series": """ @@ -1771,7 +1797,7 @@ def unique(self): result = super().unique() return result - def drop_duplicates(self, keep="first", inplace=False) -> "Series": + def drop_duplicates(self, keep="first", inplace=False) -> Optional["Series"]: """ Return Series with duplicate values removed. 
@@ -1846,7 +1872,13 @@ def drop_duplicates(self, keep="first", inplace=False) -> "Series": 5 hippo Name: animal, dtype: object """ - return super().drop_duplicates(keep=keep, inplace=inplace) + inplace = validate_bool_kwarg(inplace, "inplace") + result = super().drop_duplicates(keep=keep) + if inplace: + self._update_inplace(result) + return None + else: + return result def duplicated(self, keep="first") -> "Series": """ @@ -1991,7 +2023,7 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): nan """ skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs) - i = nanops.nanargmin(com.values_from_object(self), skipna=skipna) + i = nanops.nanargmin(self._values, skipna=skipna) if i == -1: return np.nan return self.index[i] @@ -2062,7 +2094,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): nan """ skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs) - i = nanops.nanargmax(com.values_from_object(self), skipna=skipna) + i = nanops.nanargmax(self._values, skipna=skipna) if i == -1: return np.nan return self.index[i] @@ -2076,6 +2108,9 @@ def round(self, decimals=0, *args, **kwargs) -> "Series": decimals : int, default 0 Number of decimal places to round to. If decimals is negative, it specifies the number of positions to the left of the decimal point. + *args, **kwargs + Additional arguments and keywords have no effect but might be + accepted for compatibility with NumPy. Returns ------- @@ -2097,8 +2132,10 @@ def round(self, decimals=0, *args, **kwargs) -> "Series": dtype: float64 """ nv.validate_round(args, kwargs) - result = com.values_from_object(self).round(decimals) - result = self._constructor(result, index=self.index).__finalize__(self) + result = self._values.round(decimals) + result = self._constructor(result, index=self.index).__finalize__( + self, method="round" + ) return result @@ -2130,8 +2167,8 @@ def quantile(self, q=0.5, interpolation="linear"): See Also -------- - core.window.Rolling.quantile - numpy.percentile + core.window.Rolling.quantile : Calculate the rolling quantile. + numpy.percentile : Returns the q-th percentile(s) of the array elements. Examples -------- @@ -2189,6 +2226,12 @@ def corr(self, other, method="pearson", min_periods=None) -> float: float Correlation with other. + See Also + -------- + DataFrame.corr : Compute pairwise correlation between columns. + DataFrame.corrwith : Compute pairwise correlation with another + DataFrame or Series. + Examples -------- >>> def histogram_intersection(a, b): @@ -2231,6 +2274,10 @@ def cov(self, other, min_periods=None) -> float: Covariance between Series and other normalized by N-1 (unbiased estimator). + See Also + -------- + DataFrame.cov : Compute pairwise covariance of columns. + Examples -------- >>> s1 = pd.Series([0.90010907, 0.13484424, 0.62036035]) @@ -2243,7 +2290,7 @@ def cov(self, other, min_periods=None) -> float: return np.nan return nanops.nancov(this.values, other.values, min_periods=min_periods) - def diff(self, periods=1) -> "Series": + def diff(self, periods: int = 1) -> "Series": """ First discrete difference of element. 
@@ -2310,7 +2357,9 @@ def diff(self, periods=1) -> "Series": dtype: float64 """ result = algorithms.diff(self.array, periods) - return self._constructor(result, index=self.index).__finalize__(self) + return self._constructor(result, index=self.index).__finalize__( + self, method="diff" + ) def autocorr(self, lag=1) -> float: """ @@ -2427,7 +2476,7 @@ def dot(self, other): if isinstance(other, ABCDataFrame): return self._constructor( np.dot(lvals, rvals), index=other.columns - ).__finalize__(self) + ).__finalize__(self, method="dot") elif isinstance(other, Series): return np.dot(lvals, rvals) elif isinstance(rvals, np.ndarray): @@ -2447,8 +2496,7 @@ def __rmatmul__(self, other): """ return self.dot(np.transpose(other)) - @Substitution(klass="Series") - @Appender(base._shared_docs["searchsorted"]) + @doc(base.IndexOpsMixin.searchsorted, klass="Series") def searchsorted(self, value, side="left", sorter=None): return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) @@ -2532,6 +2580,12 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): to_concat.extend(to_append) else: to_concat = [self, to_append] + if any(isinstance(x, (ABCDataFrame,)) for x in to_concat[1:]): + msg = ( + f"to_append should be a Series or list/tuple of Series, " + f"got DataFrame" + ) + raise TypeError(msg) return concat( to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity ) @@ -2558,12 +2612,10 @@ def _binop(self, other, func, level=None, fill_value=None): if not isinstance(other, Series): raise AssertionError("Other operand must be Series") - new_index = self.index this = self if not self.index.equals(other.index): this, other = self.align(other, level=level, join="outer", copy=False) - new_index = this.index this_vals, other_vals = ops.fill_binop(this.values, other.values, fill_value) @@ -2571,9 +2623,46 @@ def _binop(self, other, func, level=None, fill_value=None): result = func(this_vals, other_vals) name = ops.get_op_result_name(self, other) - ret = ops._construct_result(self, result, new_index, name) + ret = this._construct_result(result, name) return ret + def _construct_result( + self, result: Union[ArrayLike, Tuple[ArrayLike, ArrayLike]], name: Label + ) -> Union["Series", Tuple["Series", "Series"]]: + """ + Construct an appropriately-labelled Series from the result of an op. + + Parameters + ---------- + result : ndarray or ExtensionArray + name : Label + + Returns + ------- + Series + In the case of __divmod__ or __rdivmod__, a 2-tuple of Series. + """ + if isinstance(result, tuple): + # produced by divmod or rdivmod + + res1 = self._construct_result(result[0], name=name) + res2 = self._construct_result(result[1], name=name) + + # GH#33427 assertions to keep mypy happy + assert isinstance(res1, Series) + assert isinstance(res2, Series) + return (res1, res2) + + # We do not pass dtype to ensure that the Series constructor + # does inference in the case where `result` has object-dtype. + out = self._constructor(result, index=self.index) + out = out.__finalize__(self) + + # Set the result's name after __finalize__ is called because __finalize__ + # would set it back to self.name + out.name = name + return out + def combine(self, other, func, fill_value=None) -> "Series": """ Combine the Series with a Series or scalar according to `func`. 
@@ -2663,12 +2752,13 @@ def combine(self, other, func, fill_value=None) -> "Series": new_values = [func(lv, other) for lv in self._values] new_name = self.name - if is_categorical_dtype(self.values): + if is_categorical_dtype(self.dtype): pass - elif is_extension_array_dtype(self.values): + elif is_extension_array_dtype(self.dtype): + # TODO: can we do this for only SparseDtype? # The function can return something of any type, so check # if the type is compatible with the calling EA. - new_values = try_cast_to_ea(self._values, new_values) + new_values = maybe_cast_to_extension_array(type(self._values), new_values) return self._constructor(new_values, index=new_index, name=new_name) def combine_first(self, other) -> "Series": @@ -2760,7 +2850,7 @@ def update(self, other) -> None: other = other.reindex_like(self) mask = notna(other) - self._data = self._data.putmask(mask=mask, new=other, inplace=True) + self._mgr = self._mgr.putmask(mask=mask, new=other) self._maybe_update_cacher() # ---------------------------------------------------------------------- @@ -2946,7 +3036,7 @@ def _try_kind_sort(arr): if inplace: self._update_inplace(result) else: - return result.__finalize__(self) + return result.__finalize__(self, method="sort_values") def sort_index( self, @@ -3124,7 +3214,7 @@ def sort_index( if inplace: self._update_inplace(result) else: - return result.__finalize__(self) + return result.__finalize__(self, method="sort_index") def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": """ @@ -3149,7 +3239,7 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": See Also -------- - numpy.ndarray.argsort + numpy.ndarray.argsort : Returns the indices that would sort this array. """ values = self._values mask = isna(values) @@ -3158,11 +3248,13 @@ def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": result = Series(-1, index=self.index, name=self.name, dtype="int64") notmask = ~mask result[notmask] = np.argsort(values[notmask], kind=kind) - return self._constructor(result, index=self.index).__finalize__(self) + return self._constructor(result, index=self.index).__finalize__( + self, method="argsort" + ) else: return self._constructor( np.argsort(values, kind=kind), index=self.index, dtype="int64" - ).__finalize__(self) + ).__finalize__(self, method="argsort") def nlargest(self, n=5, keep="first") -> "Series": """ @@ -3380,7 +3472,7 @@ def swaplevel(self, i=-2, j=-1, copy=True) -> "Series": assert isinstance(self.index, ABCMultiIndex) new_index = self.index.swaplevel(i, j) return self._constructor(self._values, index=new_index, copy=copy).__finalize__( - self + self, method="swaplevel" ) def reorder_levels(self, order) -> "Series": @@ -3584,7 +3676,9 @@ def map(self, arg, na_action=None) -> "Series": dtype: object """ new_values = super()._map_values(arg, na_action=na_action) - return self._constructor(new_values, index=self.index).__finalize__(self) + return self._constructor(new_values, index=self.index).__finalize__( + self, method="map" + ) def _gotitem(self, key, ndim, subset=None) -> "Series": """ @@ -3771,7 +3865,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): """ if len(self) == 0: return self._constructor(dtype=self.dtype, index=self.index).__finalize__( - self + self, method="apply" ) # dispatch to agg @@ -3800,7 +3894,7 @@ def f(x): # GH#23179 some EAs do not have `map` mapped = self._values.map(f) else: - values = self.astype(object).values + values = self.astype(object)._values mapped = lib.map_infer(values, f, 
convert=convert_dtype) if len(mapped) and isinstance(mapped[0], Series): @@ -3808,7 +3902,9 @@ def f(x): # so extension arrays can be used return self._constructor_expanddim(pd.array(mapped), index=self.index) else: - return self._constructor(mapped, index=self.index).__finalize__(self) + return self._constructor(mapped, index=self.index).__finalize__( + self, method="apply" + ) def _reduce( self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds @@ -3855,7 +3951,7 @@ def _needs_reindex_multi(self, axes, method, level): """ return False - @Appender(generic._shared_docs["align"] % _shared_doc_kwargs) + @doc(NDFrame.align, **_shared_doc_kwargs) def align( self, other, @@ -3906,7 +4002,7 @@ def rename( Parameters ---------- axis : {0 or "index"} - Unused. Accepted for compatability with DataFrame method only. + Unused. Accepted for compatibility with DataFrame method only. index : scalar, hashable sequence, dict-like or function, optional Functions or dict-like are transformations to apply to the index. @@ -3983,7 +4079,7 @@ def rename( see_also_sub="", ) @Appender(generic.NDFrame.set_axis.__doc__) - def set_axis(self, labels, axis=0, inplace=False): + def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): return super().set_axis(labels, axis=axis, inplace=inplace) @Substitution(**_shared_doc_kwargs) @@ -4017,12 +4113,8 @@ def drop( index : single label or list-like Redundant for application on Series, but 'index' can be used instead of 'labels'. - - .. versionadded:: 0.21.0 columns : single label or list-like No change is made to the Series; use 'index' or 'labels' instead. - - .. versionadded:: 0.21.0 level : int or level name, optional For MultiIndex, level for which the labels will be removed. inplace : bool, default False @@ -4120,7 +4212,7 @@ def fillna( downcast=downcast, ) - @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) + @doc(NDFrame.replace, **_shared_doc_kwargs) def replace( self, to_replace=None, @@ -4139,7 +4231,7 @@ def replace( method=method, ) - @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) + @doc(NDFrame.shift, **_shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value @@ -4253,7 +4345,9 @@ def isin(self, values) -> "Series": Name: animal, dtype: bool """ result = algorithms.isin(self, values) - return self._constructor(result, index=self.index).__finalize__(self) + return self._constructor(result, index=self.index).__finalize__( + self, method="isin" + ) def between(self, left, right, inclusive=True) -> "Series": """ @@ -4489,7 +4583,9 @@ def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": assert isinstance(self.index, (ABCDatetimeIndex, ABCPeriodIndex)) new_index = self.index.to_timestamp(freq=freq, how=how) - return self._constructor(new_values, index=new_index).__finalize__(self) + return self._constructor(new_values, index=new_index).__finalize__( + self, method="to_timestamp" + ) def to_period(self, freq=None, copy=True) -> "Series": """ @@ -4514,7 +4610,9 @@ def to_period(self, freq=None, copy=True) -> "Series": assert isinstance(self.index, ABCDatetimeIndex) new_index = self.index.to_period(freq=freq) - return self._constructor(new_values, index=new_index).__finalize__(self) + return self._constructor(new_values, index=new_index).__finalize__( + self, method="to_period" + ) # ---------------------------------------------------------------------- # Add index 
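A brief, user-level sketch of two behaviours touched by the `pandas/core/series.py` hunks above: `divmod` on a Series returning a pair of Series (built through the new `_construct_result` helper), and the explicit `TypeError` now raised when a DataFrame is passed to `Series.append`. This is an illustrative sketch only, assuming a pandas build that includes these changes; the variable names are invented for the example.

```python
import pandas as pd

s = pd.Series([7, 8, 9], name="vals")

# divmod produces a 2-tuple of Series; each piece keeps the caller's index,
# and the name is re-applied to each result after construction.
quot, rem = divmod(s, 2)
print(quot.name, rem.name)    # vals vals
print(list(quot), list(rem))  # [3, 4, 4] [1, 0, 1]

# append now rejects DataFrames up front instead of failing later in concat
try:
    s.append(pd.DataFrame({"a": [1]}))
except TypeError as err:
    print(err)  # to_append should be a Series or list/tuple of Series, got DataFrame
```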
diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 4b0fc3e47356c..76b851d8ac923 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union import warnings import numpy as np @@ -10,7 +10,7 @@ import pandas._libs.lib as lib import pandas._libs.missing as libmissing import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype +from pandas._typing import ArrayLike, Dtype, Scalar from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -36,7 +36,6 @@ from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin -import pandas.core.common as com from pandas.core.construction import extract_array if TYPE_CHECKING: @@ -206,7 +205,7 @@ def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object): return np.ndarray(0, dtype=dtype) if isinstance(arr, ABCSeries): - arr = arr.values + arr = arr._values # TODO: extract_array? if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) if na_mask: @@ -447,7 +446,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): stacklevel=3, ) - f = lambda x: bool(regex.search(x)) + f = lambda x: regex.search(x) is not None else: if case: f = lambda x: pat in x @@ -573,7 +572,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): r""" Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to :meth:`str.replace` or - :func:`re.sub`. + :func:`re.sub`, depending on the regex value. Parameters ---------- @@ -653,9 +652,9 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): To get the idea: >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr) - 0 <_sre.SRE_Match object; span=(0, 1), match='f'>oo - 1 <_sre.SRE_Match object; span=(0, 1), match='f'>uz - 2 NaN + 0 oo + 1 uz + 2 NaN dtype: object Reverse every lowercase alphabetic word: @@ -776,19 +775,27 @@ def scalar_rep(x): else: def rep(x, r): + if x is libmissing.NA: + return x try: return bytes.__mul__(x, r) except TypeError: return str.__mul__(x, r) repeats = np.asarray(repeats, dtype=object) - result = libops.vec_binop(com.values_from_object(arr), repeats, rep) + result = libops.vec_binop(np.asarray(arr), repeats, rep) return result -def str_match(arr, pat, case=True, flags=0, na=np.nan): +def str_match( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): """ - Determine if each string matches a regular expression. + Determine if each string starts with a match of a regular expression. Parameters ---------- @@ -807,6 +814,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): See Also -------- + fullmatch : Stricter matching that requires the entire string to match. contains : Analogous, but less strict, relying on re.search instead of re.match. extract : Extract matched groups. 
@@ -817,7 +825,51 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan): regex = re.compile(pat, flags=flags) dtype = bool - f = lambda x: bool(regex.match(x)) + f = lambda x: regex.match(x) is not None + + return _na_map(f, arr, na, dtype=dtype) + + +def str_fullmatch( + arr: ArrayLike, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, +): + """ + Determine if each string entirely matches a regular expression. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : default NaN + Fill value for missing values. + + Returns + ------- + Series/array of boolean values + + See Also + -------- + match : Similar, but also returns `True` when only a *prefix* of the string + matches the regular expression. + extract : Extract matched groups. + """ + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + dtype = bool + f = lambda x: regex.fullmatch(x) is not None return _na_map(f, arr, na, dtype=dtype) @@ -2024,8 +2076,18 @@ class StringMethods(NoNewAttributesMixin): Examples -------- - >>> s.str.split('_') - >>> s.str.replace('_', '') + >>> s = pd.Series(["A_Str_Series"]) + >>> s + 0 A_Str_Series + dtype: object + + >>> s.str.split("_") + 0 [A, Str, Series] + dtype: object + + >>> s.str.replace("_", "") + 0 AStrSeries + dtype: object """ def __init__(self, data): @@ -2033,8 +2095,8 @@ def __init__(self, data): self._is_categorical = is_categorical_dtype(data) self._is_string = data.dtype.name == "string" - # .values.categories works for both Series/Index - self._parent = data.values.categories if self._is_categorical else data + # ._values.categories works for both Series/Index + self._parent = data._values.categories if self._is_categorical else data # save orig to blow up categoricals to the right type self._orig = data self._freeze() @@ -2235,7 +2297,7 @@ def _get_series_list(self, others): if isinstance(others, ABCSeries): return [others] elif isinstance(others, ABCIndexClass): - return [Series(others.values, index=others)] + return [Series(others._values, index=idx)] elif isinstance(others, ABCDataFrame): return [others[x] for x in others] elif isinstance(others, np.ndarray) and others.ndim == 2: @@ -2425,12 +2487,12 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): try: # turn anything in "others" into lists of Series others = self._get_series_list(others) - except ValueError: # do not catch TypeError raised by _get_series_list + except ValueError as err: # do not catch TypeError raised by _get_series_list raise ValueError( "If `others` contains arrays or lists (or other " "list-likes without an index), these must all be " "of the same length as the calling Series/Index." - ) + ) from err # align if required if any(not data.index.equals(x.index) for x in others): @@ -2497,7 +2559,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): Limit number of splits in output. ``None``, 0 and -1 will be interpreted as return all splits. expand : bool, default False - Expand the splitted strings into separate columns. + Expand the split strings into separate columns. * If ``True``, return DataFrame/MultiIndex expanding dimensionality. * If ``False``, return Series/Index, containing lists of strings. 
@@ -2531,9 +2593,14 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): Examples -------- - >>> s = pd.Series(["this is a regular sentence", - ... "https://docs.python.org/3/tutorial/index.html", - ... np.nan]) + >>> s = pd.Series( + ... [ + ... "this is a regular sentence", + ... "https://docs.python.org/3/tutorial/index.html", + ... np.nan + ... ] + ... ) + >>> s 0 this is a regular sentence 1 https://docs.python.org/3/tutorial/index.html 2 NaN @@ -2573,7 +2640,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): The `pat` parameter can be used to split by other characters. - >>> s.str.split(pat = "/") + >>> s.str.split(pat="/") 0 [this is a regular sentence] 1 [https:, , docs.python.org, 3, tutorial, index... 2 NaN @@ -2584,14 +2651,10 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): the columns during the split. >>> s.str.split(expand=True) - 0 1 2 3 - 0 this is a regular - 1 https://docs.python.org/3/tutorial/index.html None None None - 2 NaN NaN NaN NaN \ - 4 - 0 sentence - 1 None - 2 NaN + 0 1 2 3 4 + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html None None None None + 2 NaN NaN NaN NaN NaN For slightly more complex use cases like splitting the html document name from a url, a combination of parameter settings can be used. @@ -2606,7 +2669,9 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): expressions. >>> s = pd.Series(["1+1=2"]) - + >>> s + 0 1+1=2 + dtype: object >>> s.str.split(r"\+|=", expand=True) 0 1 2 0 1 1 2 @@ -2698,7 +2763,7 @@ def rsplit(self, pat=None, n=-1, expand=False): >>> idx.str.partition() MultiIndex([('X', ' ', '123'), ('Y', ' ', '999')], - dtype='object') + ) Or an index with tuples with ``expand=False``: @@ -2761,6 +2826,12 @@ def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) + @copy(str_fullmatch) + @forbid_nonstring_types(["bytes"]) + def fullmatch(self, pat, case=True, flags=0, na=np.nan): + result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + @copy(str_replace) @forbid_nonstring_types(["bytes"]) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b10b736b9134e..207c5cc98449a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -229,13 +229,12 @@ def _return_parsed_timezone_results(result, timezones, tz, name): ------- tz_result : Index-like of parsed dates with timezone """ - if tz is not None: - raise ValueError( - "Cannot pass a tz argument when parsing strings with timezone information." 
- ) tz_results = np.array( [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)] ) + if tz is not None: + # Convert to the same tz + tz_results = np.array([tz_result.tz_convert(tz) for tz_result in tz_results]) from pandas import Index return Index(tz_results, name=name) @@ -260,7 +259,7 @@ def _convert_listlike_datetimes( Parameters ---------- arg : list, tuple, ndarray, Series, Index - date to be parced + date to be parsed name : object None or string for the Index name tz : object @@ -323,15 +322,13 @@ def _convert_listlike_datetimes( # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime # because it expects an ndarray argument if isinstance(arg, IntegerArray): - # Explicitly pass NaT mask to array_with_unit_to_datetime - mask = arg.isna() - arg = arg._ndarray_values + result = arg.astype(f"datetime64[{unit}]") + tz_parsed = None else: - mask = None - result, tz_parsed = tslib.array_with_unit_to_datetime( - arg, mask, unit, errors=errors - ) + result, tz_parsed = tslib.array_with_unit_to_datetime( + arg, unit, errors=errors + ) if errors == "ignore": from pandas import Index @@ -361,7 +358,18 @@ def _convert_listlike_datetimes( # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation orig_arg = arg - arg, _ = maybe_convert_dtype(arg, copy=False) + try: + arg, _ = maybe_convert_dtype(arg, copy=False) + except TypeError: + if errors == "coerce": + result = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg)) + return DatetimeIndex(result, name=name) + elif errors == "ignore": + from pandas import Index + + result = Index(arg, name=name) + return result + raise arg = ensure_object(arg) require_iso8601 = False @@ -391,8 +399,10 @@ def _convert_listlike_datetimes( # datetime64[ns] orig_arg = ensure_object(orig_arg) result = _attempt_YYYYMMDD(orig_arg, errors=errors) - except (ValueError, TypeError, tslibs.OutOfBoundsDatetime): - raise ValueError("cannot convert the input to '%Y%m%d' date format") + except (ValueError, TypeError, tslibs.OutOfBoundsDatetime) as err: + raise ValueError( + "cannot convert the input to '%Y%m%d' date format" + ) from err # fallback if result is None: @@ -484,8 +494,10 @@ def _adjust_to_origin(arg, origin, unit): raise ValueError("unit must be 'D' for origin='julian'") try: arg = arg - j0 - except TypeError: - raise ValueError("incompatible 'arg' type for given 'origin'='julian'") + except TypeError as err: + raise ValueError( + "incompatible 'arg' type for given 'origin'='julian'" + ) from err # preemptively check this for a nice range j_max = Timestamp.max.to_julian_date() - j0 @@ -508,10 +520,14 @@ def _adjust_to_origin(arg, origin, unit): # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) - except tslibs.OutOfBoundsDatetime: - raise tslibs.OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") - except ValueError: - raise ValueError(f"origin {origin} cannot be converted to a Timestamp") + except tslibs.OutOfBoundsDatetime as err: + raise tslibs.OutOfBoundsDatetime( + f"origin {origin} is Out of Bounds" + ) from err + except ValueError as err: + raise ValueError( + f"origin {origin} cannot be converted to a Timestamp" + ) from err if offset.tz is not None: raise ValueError(f"origin offset {offset} must be tz-naive") @@ -548,7 +564,7 @@ def to_datetime( Parameters ---------- - arg : int, float, str, datetime, list, tuple, 1-d array, Series DataFrame/dict-like + arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like The 
object to convert to a datetime. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. @@ -590,9 +606,9 @@ def to_datetime( would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : bool, default False If True and no `format` is given, attempt to infer the format of the - datetime strings, and if it can be inferred, switch to a faster - method of parsing them. In some cases this can increase the parsing - speed by ~5-10x. + datetime strings based on the first non-NaN element, + and if it can be inferred, switch to a faster method of parsing them. + In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. @@ -861,7 +877,7 @@ def coerce(values): try: values = to_datetime(values, format="%Y%m%d", errors=errors, utc=tz) except (TypeError, ValueError) as err: - raise ValueError(f"cannot assemble the datetimes: {err}") + raise ValueError(f"cannot assemble the datetimes: {err}") from err for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) @@ -869,7 +885,9 @@ def coerce(values): try: values += to_timedelta(coerce(arg[value]), unit=u, errors=errors) except (TypeError, ValueError) as err: - raise ValueError(f"cannot assemble the datetimes [{value}]: {err}") + raise ValueError( + f"cannot assemble the datetimes [{value}]: {err}" + ) from err return values @@ -1001,13 +1019,13 @@ def _convert_listlike(arg, format): for element in arg: try: times.append(datetime.strptime(element, format).time()) - except (ValueError, TypeError): + except (ValueError, TypeError) as err: if errors == "raise": msg = ( f"Cannot convert {element} to a time with given " f"format {format}" ) - raise ValueError(msg) + raise ValueError(msg) from err elif errors == "ignore": return arg else: diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 4939cbfc9cc96..f4eb16602f8a0 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -35,17 +35,18 @@ def to_numeric(arg, errors="raise", downcast=None): Parameters ---------- arg : scalar, list, tuple, 1-d array, or Series + Argument to be converted. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. - downcast : {'integer', 'signed', 'unsigned', 'float'}, default None + downcast : {'int', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'int' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) @@ -61,7 +62,8 @@ def to_numeric(arg, errors="raise", downcast=None): Returns ------- - ret : numeric if parsing succeeded. + ret + Numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray. See Also @@ -70,7 +72,7 @@ def to_numeric(arg, errors="raise", downcast=None): to_datetime : Convert argument to datetime. to_timedelta : Convert argument to timedelta. 
numpy.ndarray.astype : Cast a numpy array to a specified type. - convert_dtypes : Convert dtypes. + DataFrame.convert_dtypes : Convert dtypes. Examples -------- @@ -160,7 +162,7 @@ def to_numeric(arg, errors="raise", downcast=None): if downcast in ("integer", "signed"): typecodes = np.typecodes["Integer"] - elif downcast == "unsigned" and np.min(values) >= 0: + elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0): typecodes = np.typecodes["UnsignedInteger"] elif downcast == "float": typecodes = np.typecodes["Float"] diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index d7529ec799022..48f30acf269da 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -58,19 +58,19 @@ def to_timedelta(arg, unit="ns", errors="raise"): >>> pd.to_timedelta('1 days 06:05:01.00003') Timedelta('1 days 06:05:01.000030') >>> pd.to_timedelta('15.5us') - Timedelta('0 days 00:00:00.000015') + Timedelta('0 days 00:00:00.000015500') Parsing a list or array of strings: >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) - TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015', NaT], + TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT], dtype='timedelta64[ns]', freq=None) Converting numbers by specifying the `unit` keyword argument: >>> pd.to_timedelta(np.arange(5), unit='s') - TimedeltaIndex(['00:00:00', '00:00:01', '00:00:02', - '00:00:03', '00:00:04'], + TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02', + '0 days 00:00:03', '0 days 00:00:04'], dtype='timedelta64[ns]', freq=None) >>> pd.to_timedelta(np.arange(5), unit='d') TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py new file mode 100644 index 0000000000000..e4debab2c22ee --- /dev/null +++ b/pandas/core/util/numba_.py @@ -0,0 +1,58 @@ +"""Common utilities for Numba operations""" +import types +from typing import Callable, Dict, Optional + +import numpy as np + +from pandas.compat._optional import import_optional_dependency + + +def check_kwargs_and_nopython( + kwargs: Optional[Dict] = None, nopython: Optional[bool] = None +): + if kwargs and nopython: + raise ValueError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + +def get_jit_arguments(engine_kwargs: Optional[Dict[str, bool]] = None): + """ + Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. + """ + if engine_kwargs is None: + engine_kwargs = {} + + nopython = engine_kwargs.get("nopython", True) + nogil = engine_kwargs.get("nogil", False) + parallel = engine_kwargs.get("parallel", False) + return nopython, nogil, parallel + + +def jit_user_function(func: Callable, nopython: bool, nogil: bool, parallel: bool): + """ + JIT the user's function given the configurable arguments. 
+ """ + numba = import_optional_dependency("numba") + + if isinstance(func, numba.targets.registry.CPUDispatcher): + # Don't jit a user passed jitted function + numba_func = func + else: + + @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) + def numba_func(data, *_args): + if getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + jf = func + else: + jf = numba.jit(func, nopython=nopython, nogil=nogil) + + def impl(data, *_args): + return jf(data, *_args) + + return impl + + return numba_func diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index ed0b816f64800..05f19de19f9f7 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -296,7 +296,7 @@ def zsqrt(x): mask = x < 0 if isinstance(x, ABCDataFrame): - if mask.values.any(): + if mask._values.any(): result[mask] = 0 else: if mask.any(): @@ -323,3 +323,13 @@ def func(arg, window, min_periods=None): return cfunc(arg, window, min_periods) return func + + +def validate_baseindexer_support(func_name: Optional[str]) -> None: + # GH 32865: These functions work correctly with a BaseIndexer subclass + BASEINDEXER_WHITELIST = {"min", "max", "mean", "sum", "median", "kurt", "quantile"} + if isinstance(func_name, str) and func_name not in BASEINDEXER_WHITELIST: + raise NotImplementedError( + f"{func_name} is not supported with using a BaseIndexer " + f"subclasses. You can use .apply() with {func_name}." + ) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index e045d1c2211d7..2759280dc1d1c 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -29,19 +29,25 @@ class EWM(_Rolling): r""" - Provide exponential weighted functions. + Provide exponential weighted (EW) functions. + + Available EW functions: ``mean()``, ``var()``, ``std()``, ``corr()``, ``cov()``. + + Exactly one parameter: ``com``, ``span``, ``halflife``, or ``alpha`` must be + provided. Parameters ---------- com : float, optional Specify decay in terms of center of mass, - :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`. + :math:`\alpha = 1 / (1 + com)`, for :math:`com \geq 0`. span : float, optional Specify decay in terms of span, - :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`. + :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`. halflife : float, optional Specify decay in terms of half-life, - :math:`\alpha = 1 - exp(log(0.5) / halflife),\text{for} halflife > 0`. + :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for + :math:`halflife > 0`. alpha : float, optional Specify smoothing factor :math:`\alpha` directly, :math:`0 < \alpha \leq 1`. @@ -50,11 +56,39 @@ class EWM(_Rolling): (otherwise result is NA). adjust : bool, default True Divide by decaying adjustment factor in beginning periods to account - for imbalance in relative weightings - (viewing EWMA as a moving average). + for imbalance in relative weightings (viewing EWMA as a moving average). + + - When ``adjust=True`` (default), the EW function is calculated using weights + :math:`w_i = (1 - \alpha)^i`. For example, the EW moving average of the series + [:math:`x_0, x_1, ..., x_t`] would be: + + .. math:: + y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ... + (1 - + \alpha)^t x_0}{1 + (1 - \alpha) + (1 - \alpha)^2 + ... + (1 - \alpha)^t} + + - When ``adjust=False``, the exponentially weighted function is calculated + recursively: + + .. 
math:: + \begin{split} + y_0 &= x_0\\ + y_t &= (1 - \alpha) y_{t-1} + \alpha x_t, + \end{split} ignore_na : bool, default False - Ignore missing values when calculating weights; - specify True to reproduce pre-0.15.0 behavior. + Ignore missing values when calculating weights; specify ``True`` to reproduce + pre-0.15.0 behavior. + + - When ``ignore_na=False`` (default), weights are based on absolute positions. + For example, the weights of :math:`x_0` and :math:`x_2` used in calculating + the final weighted average of [:math:`x_0`, None, :math:`x_2`] are + :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and + :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``. + + - When ``ignore_na=True`` (reproducing pre-0.15.0 behavior), weights are based + on relative positions. For example, the weights of :math:`x_0` and :math:`x_2` + used in calculating the final weighted average of + [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if + ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``. axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to use. The value 0 identifies the rows, and 1 identifies the columns. @@ -71,30 +105,9 @@ class EWM(_Rolling): Notes ----- - Exactly one of center of mass, span, half-life, and alpha must be provided. - Allowed values and relationship between the parameters are specified in the - parameter descriptions above; see the link at the end of this section for - a detailed explanation. - When adjust is True (default), weighted averages are calculated using - weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. - - When adjust is False, weighted averages are calculated recursively as: - weighted_average[0] = arg[0]; - weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. - - When ignore_na is False (default), weights are based on absolute positions. - For example, the weights of x and y used in calculating the final weighted - average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and - (1-alpha)**2 and alpha (if adjust is False). - - When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based - on relative positions. For example, the weights of x and y used in - calculating the final weighted average of [x, None, y] are 1-alpha and 1 - (if adjust is True), and 1-alpha and alpha (if adjust is False). - - More details can be found at - https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows + More details can be found at: + :ref:`Exponentially weighted windows `. 
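
The ``adjust=True`` weighting spelled out above can be checked by hand. A minimal sketch (alpha=0.5, three observations), which reproduces the 2.428571 value that appears in the example output later in this patch:

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0])
alpha = 0.5

# adjust=True: a plain weighted average with weights (1 - alpha)**i,
# where i counts back from the most recent observation.
weights = (1 - alpha) ** np.arange(len(s))[::-1]  # [0.25, 0.5, 1.0]
manual = (weights * s.to_numpy()).sum() / weights.sum()

assert np.isclose(manual, s.ewm(alpha=alpha, adjust=True).mean().iloc[-1])
# both evaluate to ~2.428571
```
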
Examples -------- @@ -154,33 +167,18 @@ def _constructor(self): """ Examples -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 >>> df.ewm(alpha=0.5).mean() A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.464856 0.569633 -0.490089 - 2 -0.207700 0.149687 -1.135379 - 3 -0.471677 -0.645305 -0.906555 - 4 -0.355635 -0.203033 -0.904111 - 5 1.076417 1.503943 -1.146293 - 6 -0.041654 1.925562 -0.588728 - 7 0.680292 0.132049 0.548693 - 8 0.067236 0.948257 0.163353 - 9 -0.286980 0.618493 -0.694496 + 0 1.000000 4.000000 7.000000 + 1 1.666667 4.666667 7.666667 + 2 2.428571 5.428571 8.428571 """ ) @@ -219,13 +217,13 @@ def _apply(self, func, **kwargs): try: values = self._prep_values(b.values) - except (TypeError, NotImplementedError): + except (TypeError, NotImplementedError) as err: if isinstance(obj, ABCDataFrame): exclude.extend(b.columns) del block_list[i] continue else: - raise DataError("No numeric types to aggregate") + raise DataError("No numeric types to aggregate") from err if values.size == 0: results.append(values.copy()) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 140e0144d0a2d..146c139806bca 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -37,7 +37,8 @@ class Expanding(_Rolling_and_Expanding): Examples -------- - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) + >>> df B 0 0.0 1 1.0 @@ -98,33 +99,18 @@ def _get_window(self, other=None, **kwargs): """ Examples -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 >>> df.ewm(alpha=0.5).mean() A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.464856 0.569633 -0.490089 - 2 -0.207700 0.149687 -1.135379 - 3 -0.471677 -0.645305 -0.906555 - 4 -0.355635 -0.203033 -0.904111 - 5 1.076417 1.503943 -1.146293 - 6 -0.041654 1.925562 -0.588728 - 7 0.680292 0.132049 0.548693 - 8 0.067236 0.948257 0.163353 - 9 -0.286980 0.618493 -0.694496 + 0 1.000000 4.000000 7.000000 + 1 1.666667 4.666667 7.666667 + 2 2.428571 5.428571 8.428571 """ ) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 921cdb3c2523f..9a02c5231c151 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -120,3 +120,53 @@ def get_window_bounds( np.zeros(num_values, dtype=np.int64), np.arange(1, num_values + 1, dtype=np.int64), ) + + +class FixedForwardWindowIndexer(BaseIndexer): + """ + Creates window boundaries for fixed-length windows that include the + current row. 
+ + Examples + -------- + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2) + >>> df.rolling(window=indexer, min_periods=1).sum() + B + 0 1.0 + 1 3.0 + 2 2.0 + 3 4.0 + 4 4.0 + """ + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + if center: + raise ValueError("Forward-looking windows can't have center=True") + if closed is not None: + raise ValueError( + "Forward-looking windows don't support setting the closed argument" + ) + + start = np.arange(num_values, dtype="int64") + end_s = start[: -self.window_size] + self.window_size + end_e = np.full(self.window_size, num_values, dtype="int64") + end = np.concatenate([end_s, end_e]) + + return start, end diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index d6e8194c861fa..5d35ec7457ab0 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -1,4 +1,3 @@ -import types from typing import Any, Callable, Dict, Optional, Tuple import numpy as np @@ -6,35 +5,49 @@ from pandas._typing import Scalar from pandas.compat._optional import import_optional_dependency +from pandas.core.util.numba_ import ( + check_kwargs_and_nopython, + get_jit_arguments, + jit_user_function, +) -def make_rolling_apply( - func: Callable[..., Scalar], + +def generate_numba_apply_func( args: Tuple, - nogil: bool, - parallel: bool, - nopython: bool, + kwargs: Dict[str, Any], + func: Callable[..., Scalar], + engine_kwargs: Optional[Dict[str, bool]], ): """ - Creates a JITted rolling apply function with a JITted version of - the user's function. + Generate a numba jitted apply function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. 
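
These helpers back the existing ``engine="numba"`` path of ``Rolling.apply``. A hedged sketch of the user-facing call that ends up going through ``get_jit_arguments``, ``check_kwargs_and_nopython`` and ``jit_user_function`` (assumes numba is installed; the function name is made up for illustration):

```python
import numpy as np
import pandas as pd

s = pd.Series(np.arange(10.0))

def window_mean(values):
    return values.mean()

# raw=True hands numba a plain ndarray; engine_kwargs feeds
# get_jit_arguments(), and non-empty **kwargs combined with nopython=True
# is rejected by check_kwargs_and_nopython().
out = s.rolling(3).apply(
    window_mean,
    raw=True,
    engine="numba",
    engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
)
```
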
Parameters ---------- - func : function - function to be applied to each window and will be JITed args : tuple *args to be passed into the function - nogil : bool - nogil parameter from engine_kwargs for numba.jit - parallel : bool - parallel parameter from engine_kwargs for numba.jit - nopython : bool - nopython parameter from engine_kwargs for numba.jit + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit Returns ------- Numba function """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + + check_kwargs_and_nopython(kwargs, nopython) + + numba_func = jit_user_function(func, nopython, nogil, parallel) + numba = import_optional_dependency("numba") if parallel: @@ -42,25 +55,6 @@ def make_rolling_apply( else: loop_range = range - if isinstance(func, numba.targets.registry.CPUDispatcher): - # Don't jit a user passed jitted function - numba_func = func - else: - - @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) - def numba_func(window, *_args): - if getattr(np, func.__name__, False) is func or isinstance( - func, types.BuiltinFunctionType - ): - jf = func - else: - jf = numba.jit(func, nopython=nopython, nogil=nogil) - - def impl(window, *_args): - return jf(window, *_args) - - return impl - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_apply( values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, @@ -78,49 +72,3 @@ def roll_apply( return result return roll_apply - - -def generate_numba_apply_func( - args: Tuple, - kwargs: Dict[str, Any], - func: Callable[..., Scalar], - engine_kwargs: Optional[Dict[str, bool]], -): - """ - Generate a numba jitted apply function specified by values from engine_kwargs. - - 1. jit the user's function - 2. Return a rolling apply function with the jitted function inline - - Configurations specified in engine_kwargs apply to both the user's - function _AND_ the rolling apply function. - - Parameters - ---------- - args : tuple - *args to be passed into the function - kwargs : dict - **kwargs to be passed into the function - func : function - function to be applied to each window and will be JITed - engine_kwargs : dict - dictionary of arguments to be passed into numba.jit - - Returns - ------- - Numba function - """ - if engine_kwargs is None: - engine_kwargs = {} - - nopython = engine_kwargs.get("nopython", True) - nogil = engine_kwargs.get("nogil", False) - parallel = engine_kwargs.get("parallel", False) - - if kwargs and nopython: - raise ValueError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" - ) - - return make_rolling_apply(func, args, nogil, parallel, nopython) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 65ac064a1322e..3fdf81c4bb570 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -46,6 +46,7 @@ calculate_center_offset, calculate_min_periods, get_weighted_roll_func, + validate_baseindexer_support, zsqrt, ) from pandas.core.window.indexers import ( @@ -196,7 +197,7 @@ def _dir_additions(self): def _get_win_type(self, kwargs: Dict): """ - Exists for compatibility, overriden by subclass Window. + Exists for compatibility, overridden by subclass Window. 
Parameters ---------- @@ -267,8 +268,8 @@ def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: else: try: values = ensure_float64(values) - except (ValueError, TypeError): - raise TypeError(f"cannot handle this type -> {values.dtype}") + except (ValueError, TypeError) as err: + raise TypeError(f"cannot handle this type -> {values.dtype}") from err # Convert inf to nan for C funcs inf = np.isinf(values) @@ -391,11 +392,12 @@ def _get_cython_func_type(self, func: str) -> Callable: return self._get_roll_func(f"{func}_variable") return partial(self._get_roll_func(f"{func}_fixed"), win=self._get_window()) - def _get_window_indexer(self, window: int) -> BaseIndexer: + def _get_window_indexer(self, window: int, func_name: Optional[str]) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds """ if isinstance(self.window, BaseIndexer): + validate_baseindexer_support(func_name) return self.window if self.is_freq_type: return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) @@ -441,7 +443,7 @@ def _apply( blocks, obj = self._create_blocks() block_list = list(blocks) - window_indexer = self._get_window_indexer(window) + window_indexer = self._get_window_indexer(window, name) results = [] exclude: List[Scalar] = [] @@ -449,13 +451,13 @@ def _apply( try: values = self._prep_values(b.values) - except (TypeError, NotImplementedError): + except (TypeError, NotImplementedError) as err: if isinstance(obj, ABCDataFrame): exclude.extend(b.columns) del block_list[i] continue else: - raise DataError("No numeric types to aggregate") + raise DataError("No numeric types to aggregate") from err if values.size == 0: results.append(values.copy()) @@ -898,6 +900,17 @@ class Window(_Window): 3 2.0 4 4.0 + Same as above, but with forward-looking windows + + >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2) + >>> df.rolling(window=indexer, min_periods=1).sum() + B + 0 1.0 + 1 3.0 + 2 2.0 + 3 4.0 + 4 4.0 + A ragged (meaning not-a-regular frequency), time-indexed DataFrame >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, @@ -1037,33 +1050,18 @@ def _get_window( """ Examples -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 - - >>> df.rolling(3, win_type='boxcar').agg('mean') - A B C - 0 NaN NaN NaN - 1 NaN NaN NaN - 2 -0.885035 0.212600 -0.711689 - 3 -0.323928 -0.200122 -1.093408 - 4 -0.071445 -0.431533 -1.075833 - 5 0.504739 0.676083 -0.996353 - 6 0.358206 1.903256 -0.774200 - 7 0.906020 1.283573 0.085482 - 8 -0.096361 0.818139 0.472290 - 9 0.070889 0.134399 -0.031308 + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 + + >>> df.rolling(2, win_type="boxcar").agg("mean") + A B C + 0 NaN NaN NaN + 1 1.5 4.5 7.5 + 2 2.5 5.5 8.5 """ ) @@ -1173,6 +1171,8 @@ class _Rolling_and_Expanding(_Rolling): ) def count(self): + if isinstance(self.window, BaseIndexer): + validate_baseindexer_support("count") blocks, obj = self._create_blocks() results = [] @@ -1627,6 +1627,9 @@ def quantile(self, quantile, interpolation="linear", **kwargs): """ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): + 
if isinstance(self.window, BaseIndexer): + validate_baseindexer_support("cov") + if other is None: other = self._selected_obj # only default unset @@ -1770,6 +1773,9 @@ def _get_cov(X, Y): ) def corr(self, other=None, pairwise=None, **kwargs): + if isinstance(self.window, BaseIndexer): + validate_baseindexer_support("corr") + if other is None: other = self._selected_obj # only default unset @@ -1875,11 +1881,11 @@ def _validate_freq(self): try: return to_offset(self.window) - except (TypeError, ValueError): + except (TypeError, ValueError) as err: raise ValueError( f"passed window {self.window} is not " "compatible with a datetimelike index" - ) + ) from err _agg_see_also_doc = dedent( """ @@ -1894,46 +1900,24 @@ def _validate_freq(self): """ Examples -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 + A B C + 0 1 4 7 + 1 2 5 8 + 2 3 6 9 - >>> df.rolling(3).sum() - A B C - 0 NaN NaN NaN - 1 NaN NaN NaN - 2 -2.655105 0.637799 -2.135068 - 3 -0.971785 -0.600366 -3.280224 - 4 -0.214334 -1.294599 -3.227500 - 5 1.514216 2.028250 -2.989060 - 6 1.074618 5.709767 -2.322600 - 7 2.718061 3.850718 0.256446 - 8 -0.289082 2.454418 1.416871 - 9 0.212668 0.403198 -0.093924 - - >>> df.rolling(3).agg({'A':'sum', 'B':'min'}) - A B - 0 NaN NaN - 1 NaN NaN - 2 -2.655105 -0.165272 - 3 -0.971785 -1.340923 - 4 -0.214334 -1.340923 - 5 1.514216 -1.340923 - 6 1.074618 0.211596 - 7 2.718061 -1.647453 - 8 -0.289082 -1.647453 - 9 0.212668 -1.647453 + >>> df.rolling(2).sum() + A B C + 0 NaN NaN NaN + 1 3.0 9.0 15.0 + 2 5.0 11.0 17.0 + + >>> df.rolling(2).agg({"A": "sum", "B": "min"}) + A B + 0 NaN NaN + 1 3.0 4.0 + 2 5.0 5.0 """ ) diff --git a/pandas/io/common.py b/pandas/io/common.py index c52583eed27ec..ff527de79c387 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -265,8 +265,8 @@ def get_compression_method( compression_args = dict(compression) try: compression = compression_args.pop("method") - except KeyError: - raise ValueError("If mapping, compression must have key 'method'") + except KeyError as err: + raise ValueError("If mapping, compression must have key 'method'") from err else: compression_args = {} return compression, compression_args @@ -351,8 +351,9 @@ def get_handle( 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise - no compression). If dict and compression mode is 'zip' or inferred as - 'zip', other entries passed as additional compression options. + no compression). If dict and compression mode is one of + {'zip', 'gzip', 'bz2'}, or inferred as one of the above, + other entries passed as additional compression options. .. versionchanged:: 1.0.0 @@ -360,6 +361,11 @@ def get_handle( and other keys as compression options if compression mode is 'zip'. + .. versionchanged:: 1.1.0 + + Passing compression options as keys in dict is now + supported for compression modes 'gzip' and 'bz2' as well as 'zip'. + memory_map : boolean, default False See parsers._parser_params for more information. 
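
A short sketch of what the extended dict form enables on the writer side; ``compresslevel`` is one example of an option that ``gzip.open`` and ``bz2.BZ2File`` accept and that is now forwarded:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

# Previously a dict was only honoured for 'zip'; with this change the extra
# keys are also passed through to gzip.open / bz2.BZ2File.
df.to_csv("out.csv.gz", compression={"method": "gzip", "compresslevel": 1})
df.to_csv("out.csv.bz2", compression={"method": "bz2", "compresslevel": 9})
```
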
is_text : boolean, default True @@ -394,19 +400,28 @@ def get_handle( if compression: + # GH33398 the type ignores here seem related to mypy issue #5382; + # it may be possible to remove them once that is resolved. + # GZ Compression if compression == "gzip": if is_path: - f = gzip.open(path_or_buf, mode) + f = gzip.open( + path_or_buf, mode, **compression_args # type: ignore + ) else: - f = gzip.GzipFile(fileobj=path_or_buf) + f = gzip.GzipFile( + fileobj=path_or_buf, **compression_args # type: ignore + ) # BZ Compression elif compression == "bz2": if is_path: - f = bz2.BZ2File(path_or_buf, mode) + f = bz2.BZ2File( + path_or_buf, mode, **compression_args # type: ignore + ) else: - f = bz2.BZ2File(path_or_buf) + f = bz2.BZ2File(path_or_buf, **compression_args) # type: ignore # ZIP Compression elif compression == "zip": diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 97959bd125113..d1139f640cef4 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,5 +1,5 @@ import abc -from datetime import date, datetime, timedelta +import datetime from io import BytesIO import os from textwrap import fill @@ -28,7 +28,6 @@ _pop_header_name, get_writer, ) -from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser _read_excel_doc = ( @@ -367,6 +366,9 @@ def _workbook_class(self): def load_workbook(self, filepath_or_buffer): pass + def close(self): + pass + @property @abc.abstractmethod def sheet_names(self): @@ -628,8 +630,8 @@ def __new__(cls, path, engine=None, **kwargs): engine = config.get_option(f"io.excel.{ext}.writer") if engine == "auto": engine = _get_default_writer(ext) - except KeyError: - raise ValueError(f"No engine for filetype: '{ext}'") + except KeyError as err: + raise ValueError(f"No engine for filetype: '{ext}'") from err cls = get_writer(engine) return object.__new__(cls) @@ -742,11 +744,11 @@ def _value_with_fmt(self, val): val = float(val) elif is_bool(val): val = bool(val) - elif isinstance(val, datetime): + elif isinstance(val, datetime.datetime): fmt = self.datetime_format - elif isinstance(val, date): + elif isinstance(val, datetime.date): fmt = self.date_format - elif isinstance(val, timedelta): + elif isinstance(val, datetime.timedelta): val = val.total_seconds() / float(86400) fmt = "0" else: @@ -763,9 +765,7 @@ def check_extension(cls, ext): if ext.startswith("."): ext = ext[1:] if not any(ext in extension for extension in cls.supported_extensions): - msg = "Invalid extension for engine" - f"'{pprint_thing(cls.engine)}': '{pprint_thing(ext)}'" - raise ValueError(msg) + raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'") else: return True @@ -898,14 +898,7 @@ def sheet_names(self): def close(self): """close io if necessary""" - if self.engine == "openpyxl": - # https://stackoverflow.com/questions/31416842/ - # openpyxl-does-not-close-excel-workbook-in-read-only-mode - wb = self.book - wb._archive.close() - - if hasattr(self.io, "close"): - self.io.close() + self._reader.close() def __enter__(self): return self diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 7af776dc1a10f..739c77d1c0b99 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -171,7 +171,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) elif cell_type == "string": - return str(cell) + return self._get_cell_string_value(cell) elif cell_type == "currency": 
cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) @@ -182,3 +182,28 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: return pd.to_datetime(str(cell)).time() else: raise ValueError(f"Unrecognized type {cell_type}") + + def _get_cell_string_value(self, cell) -> str: + """ + Find and decode OpenDocument text:s tags that represent + a run length encoded sequence of space characters. + """ + from odf.element import Text, Element + from odf.text import S, P + from odf.namespaces import TEXTNS + + text_p = P().qname + text_s = S().qname + + p = cell.childNodes[0] + + value = [] + if p.qname == text_p: + for k, fragment in enumerate(p.childNodes): + if isinstance(fragment, Text): + value.append(fragment.data) + elif isinstance(fragment, Element): + if fragment.qname == text_s: + spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) + value.append(" " * spaces) + return "".join(value) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index a96c0f814e2d8..0696d82e51f34 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -492,6 +492,11 @@ def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): filepath_or_buffer, read_only=True, data_only=True, keep_links=False ) + def close(self): + # https://stackoverflow.com/questions/31416842/ + # openpyxl-does-not-close-excel-workbook-in-read-only-mode + self.book.close() + @property def sheet_names(self) -> List[str]: return self.book.sheetnames diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index c8d40d7141fc8..7c8e1abb497bc 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -47,8 +47,8 @@ def _get_default_writer(ext): def get_writer(engine_name): try: return _writers[engine_name] - except KeyError: - raise ValueError(f"No Excel writer '{engine_name}'") + except KeyError as err: + raise ValueError(f"No Excel writer '{engine_name}'") from err def _excel2num(x): diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 5d4925620e75f..cd7045e7f2d2e 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -7,15 +7,18 @@ from pandas.io.common import stringify_path -def to_feather(df: DataFrame, path): +def to_feather(df: DataFrame, path, **kwargs): """ - Write a DataFrame to the feather-format + Write a DataFrame to the binary Feather format. Parameters ---------- df : DataFrame path : string file path, or file-like object + **kwargs : + Additional keywords passed to `pyarrow.feather.write_feather`. + .. 
versionadded:: 1.1.0 """ import_optional_dependency("pyarrow") from pyarrow import feather @@ -58,7 +61,7 @@ def to_feather(df: DataFrame, path): if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_feather(df, path) + feather.write_feather(df, path, **kwargs) def read_feather(path, columns=None, use_threads: bool = True): diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 0d581f30e50e7..091f7662630ff 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -131,8 +131,7 @@ def __init__( self.cols = cols # preallocate data 2d list - self.blocks = self.obj._data.blocks - ncols = sum(b.shape[0] for b in self.blocks) + ncols = self.obj.shape[-1] self.data = [None] * ncols if chunksize is None: @@ -327,10 +326,13 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: # create the data for a chunk slicer = slice(start_i, end_i) - for i in range(len(self.blocks)): - b = self.blocks[i] + + df = self.obj.iloc[slicer] + blocks = df._mgr.blocks + + for i in range(len(blocks)): + b = blocks[i] d = b.to_native_types( - slicer=slicer, na_rep=self.na_rep, float_format=self.float_format, decimal=self.decimal, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b5ddd15c1312a..59542a8da535e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -58,11 +58,8 @@ ) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, - ABCIndexClass, ABCMultiIndex, ABCPeriodIndex, - ABCSeries, - ABCSparseArray, ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import isna, notna @@ -71,6 +68,7 @@ from pandas.core.arrays.timedeltas import TimedeltaArray from pandas.core.base import PandasObject import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexes.api import Index, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex @@ -81,10 +79,10 @@ if TYPE_CHECKING: from pandas import Series, DataFrame, Categorical -formatters_type = Union[ +FormattersType = Union[ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] ] -float_format_type = Union[str, Callable, "EngFormatter"] +FloatFormatType = Union[str, Callable, "EngFormatter"] common_docstring = """ Parameters @@ -283,9 +281,7 @@ def _chk_truncate(self) -> None: series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = series._ensure_type( - concat((series.iloc[:row_num], series.iloc[-row_num:])) - ) + series = concat((series.iloc[:row_num], series.iloc[-row_num:])) self.tr_row_num = row_num else: self.tr_row_num = None @@ -455,7 +451,7 @@ class TableFormatter: show_dimensions: Union[bool, str] is_truncated: bool - formatters: formatters_type + formatters: FormattersType columns: Index @property @@ -548,9 +544,9 @@ def __init__( header: Union[bool, Sequence[str]] = True, index: bool = True, na_rep: str = "NaN", - formatters: Optional[formatters_type] = None, + formatters: Optional[FormattersType] = None, justify: Optional[str] = None, - float_format: Optional[float_format_type] = None, + float_format: Optional[FloatFormatType] = None, sparsify: Optional[bool] = None, index_names: bool = True, line_width: Optional[int] = None, @@ -1089,7 +1085,7 @@ def _get_column_name_list(self) -> List[str]: def format_array( values: Any, formatter: Optional[Callable], - float_format: Optional[float_format_type] = None, + float_format: Optional[FloatFormatType] = None, 
na_rep: str = "NaN", digits: Optional[int] = None, space: Optional[Union[str, int]] = None, @@ -1171,7 +1167,7 @@ def __init__( formatter: Optional[Callable] = None, na_rep: str = "NaN", space: Union[str, int] = 12, - float_format: Optional[float_format_type] = None, + float_format: Optional[FloatFormatType] = None, justify: str = "right", decimal: str = ".", quoting: Optional[int] = None, @@ -1230,11 +1226,7 @@ def _format(x): # object dtype return str(formatter(x)) - vals = self.values - if isinstance(vals, Index): - vals = vals._values - elif isinstance(vals, ABCSparseArray): - vals = vals.values + vals = extract_array(self.values, extract_numpy=True) is_float_type = lib.map_infer(vals, is_float) & notna(vals) leading_space = self.leading_space @@ -1278,7 +1270,7 @@ def __init__(self, *args, **kwargs): def _value_formatter( self, - float_format: Optional[float_format_type] = None, + float_format: Optional[FloatFormatType] = None, threshold: Optional[Union[float, int]] = None, ) -> Callable: """Returns a function to be applied on each value to format it""" @@ -1352,8 +1344,6 @@ def format_values_with(float_format): values = self.values is_complex = is_complex_dtype(values) mask = isna(values) - if hasattr(values, "to_dense"): # sparse numpy ndarray - values = values.to_dense() values = np.array(values, dtype="object") values[mask] = na_rep imask = (~mask).ravel() @@ -1372,7 +1362,7 @@ def format_values_with(float_format): # There is a special default string when we are fixed-width # The default is otherwise to use str instead of a formatting string - float_format: Optional[float_format_type] + float_format: Optional[FloatFormatType] if self.float_format is None: if self.fixed_width: float_format = partial( @@ -1461,9 +1451,7 @@ def _format_strings(self) -> List[str]: class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: - values = self.values - if isinstance(values, (ABCIndexClass, ABCSeries)): - values = values._values + values = extract_array(self.values, extract_numpy=True) formatter = values._formatter(boxed=True) @@ -1559,7 +1547,7 @@ def _is_dates_only( values: Union[np.ndarray, DatetimeArray, Index, DatetimeIndex] ) -> bool: # return a boolean if we are only dates (and don't have a timezone) - assert values.ndim == 1 + values = values.ravel() values = DatetimeIndex(values) if values.tz is not None: @@ -1684,14 +1672,9 @@ def _get_format_timedelta64( even_days = ( np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 ) - all_sub_day = ( - np.logical_and(consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0 - ) if even_days: format = None - elif all_sub_day: - format = "sub_day" else: format = "long" diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 585e1af3dbc01..1be0f977f9b20 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -101,7 +101,7 @@ def write_th( self, s: Any, header: bool = False, indent: int = 0, tags: Optional[str] = None ) -> None: """ - Method for writting a formatted cell. + Method for writing a formatted cell. If col_space is set on the formatter then that is used for the value of min-width. diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 0c08065f55273..1fbc321160120 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -265,7 +265,8 @@ def _sizeof_fmt(num, size_qualifier): else: _verbose_repr() - counts = data._data.get_dtype_counts() + # groupby dtype.name to collect e.g. 
Categorical columns + counts = data.dtypes.value_counts().groupby(lambda x: x.name).sum() dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] lines.append(f"dtypes: {', '.join(dtypes)}") diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 018441dacd9a8..fecdf3b758f0f 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -27,7 +27,7 @@ from pandas._libs import lib from pandas._typing import Axis, FrameOrSeries, FrameOrSeriesUnion, Label from pandas.compat._optional import import_optional_dependency -from pandas.util._decorators import Appender +from pandas.util._decorators import doc from pandas.core.dtypes.common import is_float @@ -35,7 +35,7 @@ from pandas.api.types import is_dict_like, is_list_like import pandas.core.common as com from pandas.core.frame import DataFrame -from pandas.core.generic import _shared_docs +from pandas.core.generic import NDFrame from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") @@ -192,18 +192,7 @@ def _repr_html_(self) -> str: """ return self.render() - @Appender( - _shared_docs["to_excel"] - % dict( - axes="index, columns", - klass="Styler", - axes_single_arg="{0 or 'index', 1 or 'columns'}", - optional_by=""" - by : str or list of str - Name or list of names which refer to the axis items.""", - versionadded_to_excel="\n .. versionadded:: 0.20", - ) - ) + @doc(NDFrame.to_excel, klass="Styler") def to_excel( self, excel_writer, @@ -776,8 +765,6 @@ def where( Updates the HTML representation with a style which is selected in accordance with the return value of a function. - .. versionadded:: 0.21.0 - Parameters ---------- cond : callable @@ -1003,19 +990,27 @@ def hide_columns(self, subset) -> "Styler": def _highlight_null(v, null_color: str) -> str: return f"background-color: {null_color}" if pd.isna(v) else "" - def highlight_null(self, null_color: str = "red") -> "Styler": + def highlight_null( + self, + null_color: str = "red", + subset: Optional[Union[Label, Sequence[Label]]] = None, + ) -> "Styler": """ Shade the background ``null_color`` for missing values. Parameters ---------- - null_color : str + null_color : str, default 'red' + subset : label or list of labels, default None + A valid slice for ``data`` to limit the style application to. + + .. versionadded:: 1.1.0 Returns ------- self : Styler """ - self.applymap(self._highlight_null, null_color=null_color) + self.applymap(self._highlight_null, null_color=null_color, subset=subset) return self def background_gradient( diff --git a/pandas/io/html.py b/pandas/io/html.py index 561570f466b68..c4ffe332e3020 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -11,6 +11,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError, EmptyDataError +from pandas.util._decorators import deprecate_nonkeyword_arguments from pandas.core.dtypes.common import is_list_like @@ -904,7 +905,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): "Since you passed a non-rewindable file " "object, we can't rewind it to try " "another parser. Try read_html() with a different flavor." 
- ) + ) from caught retained = caught else: @@ -921,6 +922,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): return ret +@deprecate_nonkeyword_arguments(version="2.0") def read_html( io, match=".+", @@ -1036,7 +1038,7 @@ def read_html( See Also -------- - read_csv + read_csv : Read a comma-separated values (csv) file into DataFrame. Notes ----- @@ -1057,8 +1059,6 @@ def read_html( the header, otherwise the function attempts to find the header within the body (by putting rows with only ```` elements into the header). - .. versionadded:: 0.21.0 - Similar to :func:`~read_csv` the `header` argument is applied **after** `skiprows` is applied. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 77a0c2f99496b..20724a498b397 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -11,7 +11,7 @@ from pandas._libs.tslibs import iNaT from pandas._typing import JSONSerializable from pandas.errors import AbstractMethodError -from pandas.util._decorators import deprecate_kwarg +from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments from pandas.core.dtypes.common import ensure_str, is_period_dtype @@ -345,6 +345,9 @@ def _write( @deprecate_kwarg(old_arg_name="numpy", new_arg_name=None) +@deprecate_nonkeyword_arguments( + version="2.0", allowed_args=["path_or_buf"], stacklevel=3 +) def read_json( path_or_buf=None, orient=None, @@ -490,9 +493,6 @@ def read_json( for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. - - .. versionadded:: 0.21.0 - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip or xz if path_or_buf is a string ending in @@ -500,8 +500,6 @@ def read_json( otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. - .. versionadded:: 0.21.0 - Returns ------- Series or DataFrame @@ -973,9 +971,9 @@ def _try_convert_to_date(self, data): # ignore numbers that are out of range if issubclass(new_data.dtype.type, np.number): in_range = ( - isna(new_data.values) + isna(new_data._values) | (new_data > self.min_stamp) - | (new_data.values == iNaT) + | (new_data._values == iNaT) ) if not in_range.all(): return data, False @@ -984,7 +982,7 @@ def _try_convert_to_date(self, data): for date_unit in date_units: try: new_data = to_datetime(new_data, errors="raise", unit=date_unit) - except (ValueError, OverflowError): + except (ValueError, OverflowError, TypeError): continue return new_data, True return data, False diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index f158ad6cd89e3..69e9b111a6c20 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -8,6 +8,7 @@ import numpy as np from pandas._libs.writers import convert_json_to_lines +from pandas._typing import Scalar from pandas.util._decorators import deprecate import pandas as pd @@ -226,23 +227,36 @@ def _json_normalize( Returns normalized data with columns prefixed with the given string. 
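
The record-path handling in the hunk that follows (``_pull_records``, GH 31507 / GH 30145) treats a null at the record path as an empty list and rejects non-iterable scalars. A hedged sketch of the resulting behaviour, with made-up data:

```python
import pandas as pd

data = [
    {"state": "Texas", "info": {"pop": 29}, "counties": [{"name": "Dallas"}]},
    {"state": "Ohio", "info": {"pop": 11}, "counties": None},  # null -> no rows
]

pd.json_normalize(data, record_path="counties", meta=["state"])
#      name  state
# 0  Dallas  Texas

# A scalar at the record path is not iterable, so this raises TypeError with
# an explicit message naming the offending path:
# pd.json_normalize(data, record_path=["info", "pop"])
```
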
""" - def _pull_field(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: - result = js # type: ignore + def _pull_field( + js: Dict[str, Any], spec: Union[List, str] + ) -> Union[Scalar, Iterable]: + """Internal function to pull field""" + result = js if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] + return result + + def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + """ + Interal function to pull field for records, and similar to + _pull_field, but require to return Iterable. And will raise error + if has non iterable value. + """ + result = _pull_field(js, spec) + # GH 31507 GH 30145, if result is not Iterable, raise TypeError if not + # null, otherwise return an empty list if not isinstance(result, Iterable): if pd.isnull(result): - result = [] # type: ignore + result = [] else: raise TypeError( f"{js} has non iterable value {result} for path {spec}. " "Must be iterable or null." ) - return result if isinstance(data, list) and not data: @@ -292,7 +306,7 @@ def _recursive_extract(data, path, seen_meta, level=0): _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: for obj in data: - recs = _pull_field(obj, path[0]) + recs = _pull_records(obj, path[0]) recs = [ nested_to_record(r, sep=sep, max_level=max_level) if isinstance(r, dict) @@ -315,7 +329,7 @@ def _recursive_extract(data, path, seen_meta, level=0): raise KeyError( "Try running with errors='ignore' as key " f"{e} is not always present" - ) + ) from e meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 4e42533ca2744..6061af72901a5 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -6,6 +6,7 @@ import warnings import pandas._libs.json as json +from pandas._typing import DtypeObj from pandas.core.dtypes.common import ( is_bool_dtype, @@ -26,17 +27,17 @@ loads = json.loads -def as_json_table_type(x): +def as_json_table_type(x: DtypeObj) -> str: """ Convert a NumPy / pandas type to its corresponding json_table. 
Parameters ---------- - x : array or dtype + x : np.dtype or ExtensionDtype Returns ------- - t : str + str the Table Schema data types Notes @@ -96,8 +97,8 @@ def set_default_names(data): return data -def convert_pandas_type_to_json_field(arr, dtype=None): - dtype = dtype or arr.dtype +def convert_pandas_type_to_json_field(arr): + dtype = arr.dtype if arr.name is None: name = "values" else: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9ae9729fc05ee..33747d2a6dd83 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -18,20 +18,23 @@ def get_engine(engine: str) -> "BaseImpl": if engine == "auto": # try engines in this order - try: - return PyArrowImpl() - except ImportError: - pass + engine_classes = [PyArrowImpl, FastParquetImpl] - try: - return FastParquetImpl() - except ImportError: - pass + error_msgs = "" + for engine_class in engine_classes: + try: + return engine_class() + except ImportError as err: + error_msgs += "\n - " + str(err) raise ImportError( "Unable to find a usable engine; " "tried using: 'pyarrow', 'fastparquet'.\n" - "pyarrow or fastparquet is required for parquet support" + "A suitable version of " + "pyarrow or fastparquet is required for parquet " + "support.\n" + "Trying to import the above resulted in these errors:" + f"{error_msgs}" ) if engine == "pyarrow": @@ -105,9 +108,7 @@ def write( **kwargs, ) else: - self.api.parquet.write_table( - table, path, compression=compression, **kwargs, - ) + self.api.parquet.write_table(table, path, compression=compression, **kwargs) def read(self, path, columns=None, **kwargs): path, _, _, should_close = get_filepath_or_buffer(path) @@ -260,8 +261,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): """ Load a parquet object from the file path, returning a DataFrame. - .. versionadded:: 0.21.0 - Parameters ---------- path : str, path object or file-like object @@ -287,8 +286,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): 'pyarrow' is unavailable. columns : list, default=None If not None, only these columns will be read from the file. - - .. versionadded:: 0.21.1 **kwargs Any additional kwargs are passed to the engine. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py old mode 100755 new mode 100644 index 8a3ad6cb45b57..2df81ba0aa51a --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,11 +5,12 @@ from collections import abc, defaultdict import csv import datetime -from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper +from io import StringIO, TextIOWrapper +import itertools import re import sys from textwrap import fill -from typing import Any, Dict, Set +from typing import Any, Dict, Iterable, List, Set import warnings import numpy as np @@ -34,6 +35,7 @@ ensure_str, is_bool_dtype, is_categorical_dtype, + is_dict_like, is_dtype_equal, is_extension_array_dtype, is_file_like, @@ -814,8 +816,10 @@ def __init__(self, f, engine=None, **kwds): ): try: dialect_val = getattr(dialect, param) - except AttributeError: - raise ValueError(f"Invalid dialect {kwds['dialect']} provided") + except AttributeError as err: + raise ValueError( + f"Invalid dialect {kwds['dialect']} provided" + ) from err parser_default = _parser_defaults[param] provided = kwds.get(param, parser_default) @@ -1419,6 +1423,54 @@ def __init__(self, kwds): # keep references to file handles opened by the parser itself self.handles = [] + def _validate_parse_dates_presence(self, columns: List[str]) -> None: + """ + Check if parse_dates are in columns. 
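
A sketch of the user-visible effect of this validation, spelled out in the lines that follow (file contents and column names are made up for illustration):

```python
import io
import pandas as pd

csv = io.StringIO("start,value\n2020-01-01,1\n2020-01-02,2\n")

pd.read_csv(csv, parse_dates=["start"])  # ok, "start" is present

# A name that is not in the file is now rejected with a clear message:
# pd.read_csv(io.StringIO("start,value\n2020-01-01,1\n"),
#             parse_dates=["startdate"])
# ValueError: Missing column provided to 'parse_dates': 'startdate'
```
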
+ + If user has provided names for parse_dates, check if those columns + are available. + + Parameters + ---------- + columns : list + List of names of the dataframe. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. + + """ + cols_needed: Iterable + if is_dict_like(self.parse_dates): + cols_needed = itertools.chain(*self.parse_dates.values()) + elif is_list_like(self.parse_dates): + # a column in parse_dates could be represented + # ColReference = Union[int, str] + # DateGroups = List[ColReference] + # ParseDates = Union[DateGroups, List[DateGroups], + # Dict[ColReference, DateGroups]] + cols_needed = itertools.chain.from_iterable( + col if is_list_like(col) else [col] for col in self.parse_dates + ) + else: + cols_needed = [] + + # get only columns that are references using names (str), not by index + missing_cols = ", ".join( + sorted( + { + col + for col in cols_needed + if isinstance(col, str) and col not in columns + } + ) + ) + if missing_cols: + raise ValueError( + f"Missing column provided to 'parse_dates': '{missing_cols}'" + ) + def close(self): for f in self.handles: f.close() @@ -1816,19 +1868,19 @@ def _cast_types(self, values, cast_type, column): array_type = cast_type.construct_array_type() try: return array_type._from_sequence_of_strings(values, dtype=cast_type) - except NotImplementedError: + except NotImplementedError as err: raise NotImplementedError( f"Extension Array: {array_type} must implement " "_from_sequence_of_strings in order to be used in parser methods" - ) + ) from err else: try: values = astype_nansafe(values, cast_type, copy=True, skipna=True) - except ValueError: + except ValueError as err: raise ValueError( f"Unable to convert column {column} to type {cast_type}" - ) + ) from err return values def _do_date_conversions(self, names, data): @@ -1868,7 +1920,7 @@ def __init__(self, src, **kwds): # Handle the file object with universal line mode enabled. # We will handle the newline character ourselves later on. - if isinstance(src, (BufferedIOBase, RawIOBase)): + if hasattr(src, "read") and not hasattr(src, "encoding"): src = TextIOWrapper(src, encoding=encoding, newline="") kwds["encoding"] = "utf-8" @@ -1938,6 +1990,7 @@ def __init__(self, src, **kwds): if len(self.names) < len(usecols): _validate_usecols_names(usecols, self.names) + self._validate_parse_dates_presence(self.names) self._set_noconvert_columns() self.orig_names = self.names @@ -2271,11 +2324,15 @@ def __init__(self, f, **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. self._col_indices = None - ( - self.columns, - self.num_original_columns, - self.unnamed_cols, - ) = self._infer_columns() + try: + ( + self.columns, + self.num_original_columns, + self.unnamed_cols, + ) = self._infer_columns() + except (TypeError, ValueError): + self.close() + raise # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. @@ -2308,6 +2365,7 @@ def __init__(self, f, **kwds): if self.index_names is None: self.index_names = index_names + self._validate_parse_dates_presence(self.columns) if self.parse_dates: self._no_thousands_columns = self._set_no_thousands_columns() else: @@ -2377,19 +2435,21 @@ class MyDialect(csv.Dialect): dia = MyDialect - sniff_sep = True - if sep is not None: - sniff_sep = False dia.delimiter = sep - # attempt to sniff the delimiter - if sniff_sep: + else: + # attempt to sniff the delimiter from the first valid line, + # i.e. 
no comment line and not in skiprows line = f.readline() - while self.skipfunc(self.pos): + lines = self._check_comments([[line]])[0] + while self.skipfunc(self.pos) or not lines: self.pos += 1 line = f.readline() + lines = self._check_comments([[line]])[0] - line = self._check_comments([line])[0] + # since `line` was a string, lines will be a list containing + # only a single string + line = lines[0] self.pos += 1 self.line_pos += 1 @@ -2552,12 +2612,12 @@ def _infer_columns(self): while self.line_pos <= hr: line = self._next_line() - except StopIteration: + except StopIteration as err: if self.line_pos < hr: raise ValueError( f"Passed header={hr} but only {self.line_pos + 1} lines in " "file" - ) + ) from err # We have an empty file, so check # if columns are provided. That will @@ -2569,7 +2629,7 @@ def _infer_columns(self): return columns, num_original_columns, unnamed_cols if not self.names: - raise EmptyDataError("No columns to parse from file") + raise EmptyDataError("No columns to parse from file") from err line = self.names[:] @@ -2650,9 +2710,9 @@ def _infer_columns(self): try: line = self._buffered_line() - except StopIteration: + except StopIteration as err: if not names: - raise EmptyDataError("No columns to parse from file") + raise EmptyDataError("No columns to parse from file") from err line = names[:] diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 4e731b8ecca11..6faebf56a11ab 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -43,7 +43,6 @@ def to_pickle( HIGHEST_PROTOCOL. .. [1] https://docs.python.org/3/library/pickle.html - .. versionadded:: 0.21.0 See Also -------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 048aa8b1915d1..3dd87ae6ed758 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -8,17 +8,7 @@ import itertools import os import re -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Hashable, - List, - Optional, - Tuple, - Type, - Union, -) +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union import warnings import numpy as np @@ -27,7 +17,7 @@ from pandas._libs import lib, writers as libwriters from pandas._libs.tslibs import timezones -from pandas._typing import ArrayLike, FrameOrSeries +from pandas._typing import ArrayLike, FrameOrSeries, Label from pandas.compat._optional import import_optional_dependency from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly @@ -316,9 +306,6 @@ def read_hdf( By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. - - .. versionadded:: 0.21.0 support for __fspath__ protocol. - key : object, optional The group identifier in the store. Can be omitted if the HDF file contains a single pandas object. @@ -682,7 +669,7 @@ def open(self, mode: str = "a", **kwargs): # trying to read from a non-existent file causes an error which # is not part of IOError, make it one if self._mode == "r" and "Unable to open/create file" in str(err): - raise IOError(str(err)) + raise IOError(str(err)) from err raise def close(self): @@ -1069,14 +1056,14 @@ def remove(self, key: str, where=None, start=None, stop=None): except AssertionError: # surface any assertion errors for e.g. debugging raise - except Exception: + except Exception as err: # In tests we get here with ClosedFileError, TypeError, and # _table_mod.NoSuchNodeError. TODO: Catch only these? 
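A recurring change in the io hunks above and below is replacing bare re-raises inside `except` blocks with `raise ... from err`. As a rough, self-contained sketch of the pattern (the names here are made up for illustration, this is not pandas code), chaining records the caught exception as the explicit cause, so the traceback shows both errors instead of the generic "During handling of the above exception, another exception occurred" note:

```python
formats = {"fixed": "f", "table": "t"}   # hypothetical lookup table

def validate_format(fmt: str) -> str:
    try:
        return formats[fmt.lower()]
    except KeyError as err:
        # "from err" sets __cause__ on the new exception, so the traceback reads
        # "The above exception was the direct cause of the following exception"
        # rather than implying an unrelated error happened while handling the first.
        raise TypeError(f"invalid format specified [{fmt}]") from err

validate_format("table")    # -> "t"
# validate_format("csv")    # TypeError, explicitly chained to the KeyError
```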
if where is not None: raise ValueError( "trying to remove a node with a non-None where clause!" - ) + ) from err # we are actually trying to remove a node (with children) node = self.get_node(key) @@ -1472,8 +1459,6 @@ def info(self) -> str: """ Print detailed information on the store. - .. versionadded:: 0.21.0 - Returns ------- str @@ -1521,8 +1506,8 @@ def _validate_format(self, format: str) -> str: # validate try: format = _FORMAT_MAP[format.lower()] - except KeyError: - raise TypeError(f"invalid HDFStore format specified [{format}]") + except KeyError as err: + raise TypeError(f"invalid HDFStore format specified [{format}]") from err return format @@ -1579,8 +1564,8 @@ def error(t): _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed} try: cls = _STORER_MAP[pt] - except KeyError: - raise error("_STORER_MAP") + except KeyError as err: + raise error("_STORER_MAP") from err return cls(self, group, encoding=encoding, errors=errors) # existing node (and must be a table) @@ -1614,8 +1599,8 @@ def error(t): } try: cls = _TABLE_MAP[tt] - except KeyError: - raise error("_TABLE_MAP") + except KeyError as err: + raise error("_TABLE_MAP") from err return cls(self, group, encoding=encoding, errors=errors) @@ -1931,9 +1916,7 @@ def is_indexed(self) -> bool: if not hasattr(self.table, "cols"): # e.g. if infer hasn't been called yet, self.table will be None. return False - # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute - # 'error: "None" has no attribute "cols"' - return getattr(self.table.cols, self.cname).is_indexed # type: ignore + return getattr(self.table.cols, self.cname).is_indexed def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): """ @@ -2212,7 +2195,7 @@ def __eq__(self, other: Any) -> bool: for a in ["name", "cname", "dtype", "pos"] ) - def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): + def set_data(self, data: ArrayLike): assert data is not None assert self.dtype is None @@ -2392,7 +2375,7 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): mask = isna(categories) if mask.any(): categories = categories[~mask] - codes[codes != -1] -= mask.astype(int).cumsum().values + codes[codes != -1] -= mask.astype(int).cumsum()._values converted = Categorical.from_codes( codes, categories=categories, ordered=ordered @@ -2811,7 +2794,7 @@ def read_multi_index( levels = [] codes = [] - names: List[Optional[Hashable]] = [] + names: List[Label] = [] for i in range(nlevels): level_key = f"{key}_level{i}" node = getattr(self.group, level_key) @@ -2976,7 +2959,7 @@ class SeriesFixed(GenericFixed): pandas_kind = "series" attributes = ["name"] - name: Optional[Hashable] + name: Label @property def shape(self): @@ -3075,7 +3058,7 @@ def read( def write(self, obj, **kwargs): super().write(obj, **kwargs) - data = obj._data + data = obj._mgr if not data.is_consolidated(): data = data.consolidate() @@ -3233,10 +3216,10 @@ def validate_multiindex(self, obj): ] try: return obj.reset_index(), levels - except ValueError: + except ValueError as err: raise ValueError( "duplicate names/columns in the multi-index when storing as a table" - ) + ) from err @property def nrows_expected(self) -> int: @@ -3784,11 +3767,11 @@ def get_blk_items(mgr, blocks): if table_exists and validate: try: existing_col = self.values_axes[i] - except (IndexError, KeyError): + except (IndexError, KeyError) as err: raise ValueError( f"Incompatible appended table [{blocks}]" f"with existing table [{self.values_axes}]" - ) + ) from err else: 
existing_col = None @@ -3870,18 +3853,18 @@ def _get_blocks_and_items( def get_blk_items(mgr, blocks): return [mgr.items.take(blk.mgr_locs) for blk in blocks] - blocks = block_obj._data.blocks - blk_items = get_blk_items(block_obj._data, blocks) + blocks = block_obj._mgr.blocks + blk_items = get_blk_items(block_obj._mgr, blocks) if len(data_columns): axis, axis_labels = new_non_index_axes[0] new_labels = Index(axis_labels).difference(Index(data_columns)) - mgr = block_obj.reindex(new_labels, axis=axis)._data + mgr = block_obj.reindex(new_labels, axis=axis)._mgr blocks = list(mgr.blocks) blk_items = get_blk_items(mgr, blocks) for c in data_columns: - mgr = block_obj.reindex([c], axis=axis)._data + mgr = block_obj.reindex([c], axis=axis)._mgr blocks.extend(mgr.blocks) blk_items.extend(get_blk_items(mgr, mgr.blocks)) @@ -3899,12 +3882,12 @@ def get_blk_items(mgr, blocks): b, b_items = by_items.pop(items) new_blocks.append(b) new_blk_items.append(b_items) - except (IndexError, KeyError): + except (IndexError, KeyError) as err: jitems = ",".join(pprint_thing(item) for item in items) raise ValueError( f"cannot match existing table structure for [{jitems}] " "on appending data" - ) + ) from err blocks = new_blocks blk_items = new_blk_items @@ -4692,7 +4675,7 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index raise TypeError("MultiIndex not supported here!") inferred_type = lib.infer_dtype(index, skipna=False) - # we wont get inferred_type of "datetime64" or "timedelta64" as these + # we won't get inferred_type of "datetime64" or "timedelta64" as these # would go through the DatetimeIndex/TimedeltaIndex paths above values = np.asarray(index) @@ -4836,7 +4819,9 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd # encode if needed if len(data): data = ( - Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape) + Series(data.ravel()) + .str.encode(encoding, errors) + ._values.reshape(data.shape) ) # create the sized dtype @@ -4875,7 +4860,7 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data).str.decode(encoding, errors=errors).values + data = Series(data).str.decode(encoding, errors=errors)._values else: data = data.astype(dtype, copy=False).astype(object, copy=False) @@ -4969,11 +4954,11 @@ def _dtype_to_kind(dtype_str: str) -> str: return kind -def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): +def _get_data_and_dtype_name(data: ArrayLike): """ Convert the passed data into a storable form and a dtype string. """ - if is_categorical_dtype(data.dtype): + if isinstance(data, Categorical): data = data.codes # For datetime64tz we need to drop the TZ in tests TODO: why? @@ -5061,7 +5046,7 @@ def generate(self, where): q = self.table.queryables() try: return PyTablesExpr(where, queryables=q, encoding=self.table.encoding) - except NameError: + except NameError as err: # raise a nice message, suggesting that the user should use # data_columns qkeys = ",".join(q.keys()) @@ -5073,7 +5058,7 @@ def generate(self, where): " an axis (e.g. 
'index' or 'columns'), or a " "data_column\n" f" The currently defined references are: {qkeys}\n" - ) + ) from err def select(self): """ diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 40fea0aaf0d07..0038e39e2ffcc 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -120,7 +120,7 @@ cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff) cdef: uint8_t cmd - uint16_t ctrl_bits, ctrl_mask = 0, ofs, cnt + uint16_t ctrl_bits = 0, ctrl_mask = 0, ofs, cnt int rpos = 0, k uint8_t[:] outbuff = np.zeros(result_length, dtype=np.uint8) Py_ssize_t ipos = 0, length = len(inbuff) @@ -431,7 +431,7 @@ cdef class Parser: elif column_types[j] == column_type_string: # string string_chunk[js, current_row] = np.array(source[start:( - start + lngt)]).tostring().rstrip(b"\x00 ") + start + lngt)]).tobytes().rstrip(b"\x00 ") js += 1 self.current_row_on_page_index += 1 diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index e67d68f7e0975..85b7fd497cedd 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -9,7 +9,6 @@ """ from collections import abc from datetime import datetime -from io import BytesIO import struct import warnings @@ -263,13 +262,9 @@ def __init__( if isinstance(filepath_or_buffer, (str, bytes)): self.filepath_or_buffer = open(filepath_or_buffer, "rb") else: - # Copy to BytesIO, and ensure no encoding - contents = filepath_or_buffer.read() - try: - contents = contents.encode(self._encoding) - except UnicodeEncodeError: - pass - self.filepath_or_buffer = BytesIO(contents) + # Since xport files include non-text byte sequences, xport files + # should already be opened in binary mode in Python 3. + self.filepath_or_buffer = filepath_or_buffer self._read_header() diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9a53e7cd241e1..c657a925a5eab 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -313,7 +313,7 @@ def read_sql_query( See Also -------- read_sql_table : Read SQL database table into a DataFrame. - read_sql + read_sql : Read SQL query or database table into a DataFrame. Notes ----- @@ -692,29 +692,25 @@ def insert_data(self): column_names = list(map(str, temp.columns)) ncols = len(column_names) data_list = [None] * ncols - blocks = temp._data.blocks - - for b in blocks: - if b.is_datetime: - # return datetime.datetime objects - if b.is_datetimetz: - # GH 9086: Ensure we return datetimes with timezone info - # Need to return 2-D data; DatetimeIndex is 1D - d = b.values.to_pydatetime() - d = np.atleast_2d(d) - else: - # convert to microsecond resolution for datetime.datetime - d = b.values.astype("M8[us]").astype(object) + + for i, (_, ser) in enumerate(temp.items()): + vals = ser._values + if vals.dtype.kind == "M": + d = vals.to_pydatetime() + elif vals.dtype.kind == "m": + # store as integers, see GH#6921, GH#7076 + d = vals.view("i8").astype(object) else: - d = np.array(b.get_values(), dtype=object) + d = vals.astype(object) + + assert isinstance(d, np.ndarray), type(d) - # replace NaN with None - if b._can_hold_na: + if ser._can_hold_na: + # Note: this will miss timedeltas since they are converted to int mask = isna(d) d[mask] = None - for col_loc, col in zip(b.mgr_locs, d): - data_list[col_loc] = col + data_list[i] = d return column_names, data_list @@ -970,7 +966,8 @@ def _sqlalchemy_type(self, col): return TIMESTAMP(timezone=True) except AttributeError: # The column is actually a DatetimeIndex - if col.tz is not None: + # GH 26761 or an Index with date-like data e.g. 
9999-01-01 + if getattr(col, "tz", None) is not None: return TIMESTAMP(timezone=True) return DateTime if col_type == "timedelta64": diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0397dfa923afb..b9b43685415d1 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -351,15 +351,14 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series: def parse_dates_safe(dates, delta=False, year=False, days=False): d = {} - if is_datetime64_dtype(dates.values): + if is_datetime64_dtype(dates.dtype): if delta: time_delta = dates - stata_epoch - d["delta"] = time_delta.values.astype(np.int64) // 1000 # microseconds + d["delta"] = time_delta._values.astype(np.int64) // 1000 # microseconds if days or year: - # ignore since mypy reports that DatetimeIndex has no year/month date_index = DatetimeIndex(dates) - d["year"] = date_index.year # type: ignore - d["month"] = date_index.month # type: ignore + d["year"] = date_index.year + d["month"] = date_index.month if days: days_in_ns = dates.astype(np.int64) - to_datetime( d["year"], format="%Y" @@ -368,7 +367,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): elif infer_dtype(dates, skipna=False) == "datetime": if delta: - delta = dates.values - stata_epoch + delta = dates._values - stata_epoch def f(x: datetime.timedelta) -> float: return US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds @@ -377,8 +376,8 @@ def f(x: datetime.timedelta) -> float: d["delta"] = v(delta) if year: year_month = dates.apply(lambda x: 100 * x.year + x.month) - d["year"] = year_month.values // 100 - d["month"] = year_month.values - d["year"] * 100 + d["year"] = year_month._values // 100 + d["month"] = year_month._values - d["year"] * 100 if days: def g(x: datetime.datetime) -> int: @@ -1672,7 +1671,7 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra continue if convert_missing: # Replacement follows Stata notation - missing_loc = np.nonzero(missing._ndarray_values)[0] + missing_loc = np.nonzero(np.asarray(missing))[0] umissing, umissing_loc = np.unique(series[missing], return_inverse=True) replacement = Series(series, dtype=np.object) for j, um in enumerate(umissing): @@ -1956,7 +1955,7 @@ def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int: if dtype.type == np.object_: # try to coerce it to the biggest string # not memory efficient, what else could we # do? - itemsize = max_len_string_array(ensure_object(column.values)) + itemsize = max_len_string_array(ensure_object(column._values)) return max(itemsize, 1) elif dtype == np.float64: return 255 @@ -1998,7 +1997,7 @@ def _dtype_to_default_stata_fmt( if force_strl: return "%9s" if dtype.type == np.object_: - itemsize = max_len_string_array(ensure_object(column.values)) + itemsize = max_len_string_array(ensure_object(column._values)) if itemsize > max_str_len: if dta_version >= 117: return "%9s" @@ -2151,7 +2150,7 @@ def _prepare_categoricals(self, data: DataFrame) -> DataFrame: "It is not possible to export " "int64-based categorical data to Stata." 
) - values = data[col].cat.codes.values.copy() + values = data[col].cat.codes._values.copy() # Upcast if needed so that correct missing values can be set if values.max() >= get_base_missing_value(dtype): @@ -2384,7 +2383,7 @@ def _encode_strings(self) -> None: encoded = self.data[col].str.encode(self._encoding) # If larger than _max_string_length do nothing if ( - max_len_string_array(ensure_object(encoded.values)) + max_len_string_array(ensure_object(encoded._values)) <= self._max_string_length ): self.data[col] = encoded @@ -2650,7 +2649,7 @@ def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) if dtype.type == np.object_: # try to coerce it to the biggest string # not memory efficient, what else could we # do? - itemsize = max_len_string_array(ensure_object(column.values)) + itemsize = max_len_string_array(ensure_object(column._values)) itemsize = max(itemsize, 1) if itemsize <= 2045: return itemsize diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index d3db539084609..e466a215091ea 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1468,15 +1468,19 @@ def scatter(self, x, y, s=None, c=None, **kwargs): y : int or str The column name or column position to be used as vertical coordinates for each point. - s : scalar or array_like, optional + s : str, scalar or array_like, optional The size of each point. Possible values are: + - A string with the name of the column to be used for marker's size. + - A single scalar so all points have the same size. - A sequence of scalars, which will be used for each point's size recursively. For instance, when passing [2,14] all points size will be either 2 or 14, alternatively. + .. versionchanged:: 1.1.0 + c : str, int or array_like, optional The color of each point. 
Possible values are: diff --git a/pandas/plotting/_matplotlib/compat.py b/pandas/plotting/_matplotlib/compat.py index e7855068334f7..f2c5032112bc9 100644 --- a/pandas/plotting/_matplotlib/compat.py +++ b/pandas/plotting/_matplotlib/compat.py @@ -20,3 +20,4 @@ def inner(): _mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge) _mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) _mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) +_mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index c399e5b9b7017..8260684c02ea6 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -218,13 +218,13 @@ def _convert_1d(values, units, axis): if isinstance(values, valid_types) or is_integer(values) or is_float(values): return get_datevalue(values, axis.freq) elif isinstance(values, PeriodIndex): - return values.asfreq(axis.freq)._ndarray_values + return values.asfreq(axis.freq).asi8 elif isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) elif lib.infer_dtype(values, skipna=False) == "period": # https://github.com/pandas-dev/pandas/issues/24304 # convert ndarray[period] -> PeriodIndex - return PeriodIndex(values, freq=axis.freq)._ndarray_values + return PeriodIndex(values, freq=axis.freq).asi8 elif isinstance(values, (list, tuple, np.ndarray, Index)): return [get_datevalue(x, axis.freq) for x in values] return values @@ -607,7 +607,7 @@ def _daily_finder(vmin, vmax, freq): info = np.zeros( span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")] ) - info["val"][:] = dates_._ndarray_values + info["val"][:] = dates_.asi8 info["fmt"][:] = "" info["maj"][[0, -1]] = True # .. and set some shortcuts diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 63d0b8abe59d9..46941e437a4ce 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -902,7 +902,11 @@ def _plot_colorbar(self, ax, **kwds): # For a more detailed description of the issue # see the following link: # https://github.com/ipython/ipython/issues/11215 - img = ax.collections[0] + + # GH33389, if ax is used multiple times, we should always + # use the last one which contains the latest information + # about the ax + img = ax.collections[-1] cbar = self.fig.colorbar(img, ax=ax, **kwds) if _mpl_ge_3_0_0(): @@ -934,6 +938,8 @@ def __init__(self, data, x, y, s=None, c=None, **kwargs): # hide the matplotlib default for size, in case we want to change # the handling of this argument later s = 20 + elif is_hashable(s) and s in data.columns: + s = data[s] super().__init__(data, x, y, s=s, **kwargs) if is_integer(c) and not self.data.columns.holds_integer(): c = self.data.columns[c] diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index d54fc73b495ba..3a0cdc90dfd5c 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -28,10 +28,7 @@ def _args_adjust(self): values = values[~isna(values)] _, self.bins = np.histogram( - values, - bins=self.bins, - range=self.kwds.get("range", None), - weights=self.kwds.get("weights", None), + values, bins=self.bins, range=self.kwds.get("range", None) ) if is_list_like(self.bottom): @@ -77,6 +74,14 @@ def _make_plot(self): kwds["style"] = style kwds = self._make_plot_keywords(kwds, y) + + # We allow weights to be a multi-dimensional array, e.g. 
a (10, 2) array, + # and each sub-array (10,) will be called in each iteration. If users only + # provide 1D array, we assume the same weights is used for all iterations + weights = kwds.get("weights", None) + if weights is not None and np.ndim(weights) != 1: + kwds["weights"] = weights[:, i] + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) self._add_legend_handle(artists[0], label, index=i) diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 0720f544203f7..7319e8de3ec6e 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -260,6 +260,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): import matplotlib.pyplot as plt + # TODO: is the failure mentioned below still relevant? # random.sample(ndarray, int) fails on python 3.3, sigh data = list(series.values) samplings = [random.sample(data, size) for _ in range(samples)] diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 5743288982da4..08d945f679810 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -9,6 +9,8 @@ from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.plotting._matplotlib import compat + def format_date_labels(ax, rot): # mini version of autofmt_xdate @@ -288,6 +290,12 @@ def _remove_labels_from_axis(axis): def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): if nplots > 1: + if compat._mpl_ge_3_2_0(): + row_num = lambda x: x.get_subplotspec().rowspan.start + col_num = lambda x: x.get_subplotspec().colspan.start + else: + row_num = lambda x: x.rowNum + col_num = lambda x: x.colNum if nrows > 1: try: @@ -295,13 +303,13 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): # so that we can correctly handle 'gaps" layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool) for ax in axarr: - layout[ax.rowNum, ax.colNum] = ax.get_visible() + layout[row_num(ax), col_num(ax)] = ax.get_visible() for ax in axarr: # only the last row of subplots should get x labels -> all # other off layout handles the case that the subplot is # the last in the column, because below is no subplot/gap. - if not layout[ax.rowNum + 1, ax.colNum]: + if not layout[row_num(ax) + 1, col_num(ax)]: continue if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: _remove_labels_from_axis(ax.xaxis) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 47a4fd8ff0e95..30c5ba0ed94b6 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -29,10 +29,10 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): def register(): """ - Register Pandas Formatters and Converters with matplotlib. + Register pandas formatters and converters with matplotlib. This function modifies the global ``matplotlib.units.registry`` - dictionary. Pandas adds custom converters for + dictionary. pandas adds custom converters for * pd.Timestamp * pd.Period @@ -43,7 +43,7 @@ def register(): See Also -------- - deregister_matplotlib_converters + deregister_matplotlib_converters : Remove pandas formatters and converters. """ plot_backend = _get_plot_backend("matplotlib") plot_backend.register() @@ -51,7 +51,7 @@ def register(): def deregister(): """ - Remove pandas' formatters and converters. + Remove pandas formatters and converters. Removes the custom converters added by :func:`register`. 
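The `_handle_shared_axes` change above works around matplotlib 3.2 deprecating the `ax.rowNum` / `ax.colNum` attributes in favour of the subplot spec. A minimal sketch of the two lookups, assuming a plain `plt.subplots` grid:

```python
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2)
ax = axes[1, 0]

# matplotlib < 3.2
# row, col = ax.rowNum, ax.colNum          # deprecated, later removed

# matplotlib >= 3.2: go through the SubplotSpec instead
spec = ax.get_subplotspec()
row, col = spec.rowspan.start, spec.colspan.start   # (1, 0)
```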
This attempts to set the state of the registry back to the state before @@ -62,7 +62,8 @@ def deregister(): See Also -------- - register_matplotlib_converters + register_matplotlib_converters : Register pandas formatters and converters + with matplotlib. """ plot_backend = _get_plot_backend("matplotlib") plot_backend.deregister() @@ -155,7 +156,7 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): Parameters ---------- frame : `DataFrame` - Pandas object holding the data. + pandas object holding the data. class_column : str Column name containing the name of the data point category. ax : :class:`matplotlib.axes.Axes`, optional @@ -270,7 +271,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): Parameters ---------- series : pandas.Series - Pandas Series from where to get the samplings for the bootstrapping. + pandas Series from where to get the samplings for the bootstrapping. fig : matplotlib.figure.Figure, default None If given, it will use the `fig` reference for plotting instead of creating a new one with default parameters. @@ -359,7 +360,7 @@ def parallel_coordinates( -------- >>> from matplotlib import pyplot as plt >>> df = pd.read_csv('https://raw.github.com/pandas-dev/pandas/master' - '/pandas/tests/data/csv/iris.csv') + '/pandas/tests/data/iris.csv') >>> pd.plotting.parallel_coordinates( df, 'Name', color=('#556270', '#4ECDC4', '#C7F464')) diff --git a/pandas/tests/arithmetic/test_array_ops.py b/pandas/tests/arithmetic/test_array_ops.py index d8aaa3183a1c6..53cb10ba9fc5e 100644 --- a/pandas/tests/arithmetic/test_array_ops.py +++ b/pandas/tests/arithmetic/test_array_ops.py @@ -4,7 +4,7 @@ import pytest import pandas._testing as tm -from pandas.core.ops.array_ops import na_logical_op +from pandas.core.ops.array_ops import comparison_op, na_logical_op def test_na_logical_op_2d(): @@ -19,3 +19,18 @@ def test_na_logical_op_2d(): result = na_logical_op(left, right, operator.or_) expected = right tm.assert_numpy_array_equal(result, expected) + + +def test_object_comparison_2d(): + left = np.arange(9).reshape(3, 3).astype(object) + right = left.T + + result = comparison_op(left, right, operator.eq) + expected = np.eye(3).astype(bool) + tm.assert_numpy_array_equal(result, expected) + + # Ensure that cython doesn't raise on non-writeable arg, which + # we can get from np.broadcast_to + right.flags.writeable = False + result = comparison_op(left, right, operator.ne) + tm.assert_numpy_array_equal(result, ~expected) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index d3f9ac4f3f8b2..56c5647d865d3 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1,7 +1,7 @@ # Arithmetic tests for DataFrame/Series/Index/Array classes that should # behave identically. 
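The new `test_object_comparison_2d` above exercises a comparison with a non-writeable operand. The situation it guards against is easy to reproduce: `np.broadcast_to` returns a read-only view, so any code path that assumes it may write to its inputs would fail. A tiny illustration:

```python
import numpy as np

base = np.arange(3)
view = np.broadcast_to(base, (3, 3))

view.flags.writeable    # False: broadcast views are read-only
# view[0, 0] = 99       # would raise ValueError: assignment destination is read-only
```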
# Specifically for datetime64 and datetime64tz dtypes -from datetime import datetime, timedelta +from datetime import datetime, time, timedelta from itertools import product, starmap import operator import warnings @@ -27,6 +27,7 @@ date_range, ) import pandas._testing as tm +from pandas.core.arrays import DatetimeArray, TimedeltaArray from pandas.core.ops import roperator from pandas.tests.arithmetic.common import ( assert_invalid_addsub_type, @@ -733,7 +734,7 @@ def test_dti_cmp_object_dtype(self): result = dti == other expected = np.array([True] * 5 + [False] * 5) tm.assert_numpy_array_equal(result, expected) - msg = "Cannot compare type" + msg = ">=' not supported between instances of 'Timestamp' and 'Timedelta'" with pytest.raises(TypeError, match=msg): dti >= other @@ -956,6 +957,18 @@ def test_dt64arr_sub_NaT(self, box_with_array): # ------------------------------------------------------------- # Subtraction of datetime-like array-like + def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture): + dti = pd.date_range("2016-01-01", periods=3, tz=tz_naive_fixture) + expected = dti - dti + + obj = tm.box_expected(dti, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + with tm.assert_produces_warning(warn): + result = obj - obj.astype(object) + tm.assert_equal(result, expected) + def test_dt64arr_naive_sub_dt64ndarray(self, box_with_array): dti = pd.date_range("2016-01-01", periods=3, tz=None) dt64vals = dti.values @@ -1019,6 +1032,8 @@ def test_dt64arr_add_timestamp_raises(self, box_with_array): np.array([2.0, 3.0]), # GH#13078 datetime +/- Period is invalid pd.Period("2011-01-01", freq="D"), + # https://github.com/pandas-dev/pandas/issues/10329 + time(1, 2, 3), ], ) @pytest.mark.parametrize("dti_freq", [None, "D"]) @@ -1056,6 +1071,60 @@ def test_dt64arr_add_sub_parr( ) assert_invalid_addsub_type(dtarr, parr, msg) + def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixture): + # https://github.com/pandas-dev/pandas/issues/10329 + + tz = tz_naive_fixture + + obj1 = pd.date_range("2012-01-01", periods=3, tz=tz) + obj2 = [time(i, i, i) for i in range(3)] + + obj1 = tm.box_expected(obj1, box_with_array) + obj2 = tm.box_expected(obj2, box_with_array) + + with warnings.catch_warnings(record=True): + # pandas.errors.PerformanceWarning: Non-vectorized DateOffset being + # applied to Series or DatetimeIndex + # we aren't testing that here, so ignore. 
+ warnings.simplefilter("ignore", PerformanceWarning) + + # If `x + y` raises, then `y + x` should raise here as well + + msg = ( + r"unsupported operand type\(s\) for -: " + "'(Timestamp|DatetimeArray)' and 'datetime.time'" + ) + with pytest.raises(TypeError, match=msg): + obj1 - obj2 + + msg = "|".join( + [ + "cannot subtract DatetimeArray from ndarray", + "ufunc (subtract|'subtract') cannot use operands with types " + r"dtype\('O'\) and dtype\('"], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.astype("int64") + expected = np.array([1, 0, 1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("boolean") + tm.assert_extension_array_equal(result, arr) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, arr) + + +def test_astype_to_integer_array(): + # astype to IntegerArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("Int64") + expected = pd.array([1, 0, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_comparison.py b/pandas/tests/arrays/boolean/test_comparison.py new file mode 100644 index 0000000000000..726b78fbd43bd --- /dev/null +++ b/pandas/tests/arrays/boolean/test_comparison.py @@ -0,0 +1,94 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.tests.extension.base import BaseOpsUtil + + +@pytest.fixture +def data(): + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + # propagate NAs + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = pd.Series(data._data) + expected = op(expected, other) + expected = expected.astype("boolean") + # propagate NAs + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + self._compare_other(data, op_name, True) + + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + other = pd.array([True] * len(data), dtype="boolean") + self._compare_other(data, op_name, other) + other = np.array([True] * len(data)) + self._compare_other(data, op_name, other) + other = pd.Series([True] * len(data)) + self._compare_other(data, op_name, other) + + @pytest.mark.parametrize("other", [True, False, pd.NA]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None], dtype="boolean") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = BooleanArray(values, a._mask, copy=True) + 
tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + + result = op(a, b) + + values = op(a._data, b._data) + mask = a._mask | b._mask + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py new file mode 100644 index 0000000000000..bf1aba190f3e2 --- /dev/null +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -0,0 +1,376 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.core.arrays.boolean import coerce_to_array + + +@pytest.fixture +def data(): + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +def test_boolean_array_constructor(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.tolist(), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, mask.tolist()) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.astype(int), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, None) + + with pytest.raises(ValueError, match="values must be a 1D array"): + BooleanArray(values.reshape(1, -1), mask) + + with pytest.raises(ValueError, match="mask must be a 1D array"): + BooleanArray(values, mask.reshape(1, -1)) + + +def test_boolean_array_constructor_copy(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = BooleanArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_boolean_array(): + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, False]) + ) + + result = pd.array([True, False, True], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, True]) + ) 
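These construction tests all rely on the same representation: a nullable boolean array is a pair of 1D numpy bool arrays, one holding the values and one marking which positions are missing. A short sketch of the round trip (output shown as comments, approximate):

```python
import numpy as np
import pandas as pd

values = np.array([True, False, True], dtype=bool)
mask = np.array([False, False, True], dtype=bool)   # True marks a missing entry

arr = pd.arrays.BooleanArray(values, mask)
# arr is [True, False, <NA>] with dtype "boolean"

# the same array via the public constructor
same = pd.array([True, False, None], dtype="boolean")
```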
+ + result = pd.array([True, False, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_all_none(): + expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True])) + + result = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]), + ([True, np.nan], [True, None]), + ([True, pd.NA], [True, None]), + ([np.nan, np.nan], [None, None]), + (np.array([np.nan, np.nan], dtype=float), [None, None]), + ], +) +def test_to_boolean_array_missing_indicators(a, b): + result = pd.array(a, dtype="boolean") + expected = pd.array(b, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + # "foo", + [1, 2], + [1.0, 2.0], + pd.date_range("20130101", periods=2), + np.array(["foo"]), + np.array([1, 2]), + np.array([1.0, 2.0]), + [np.nan, {"a": 1}], + ], +) +def test_to_boolean_array_error(values): + # error in converting existing arrays to BooleanArray + msg = "Need to pass bool-like value" + with pytest.raises(TypeError, match=msg): + pd.array(values, dtype="boolean") + + +def test_to_boolean_array_from_integer_array(): + result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1, 0, 1, None]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_float_array(): + result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_integer_like(): + # integers of 0's and 1's + result = pd.array([1, 0, 1, 0], dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array([1, 0, 1, None], dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_coerce_to_array(): + # TODO this is currently not public API + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is values + assert result._mask is mask + result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is not values + assert result._mask is not mask + + # 
mixed missing from values and mask + values = [True, False, None, False] + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray( + np.array([True, False, True, True]), np.array([False, False, True, True]) + ) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask)) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(values, mask=mask.tolist())) + tm.assert_extension_array_equal(result, expected) + + # raise errors for wrong dimension + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + with pytest.raises(ValueError, match="values must be a 1D list-like"): + coerce_to_array(values.reshape(1, -1)) + + with pytest.raises(ValueError, match="mask must be a 1D list-like"): + coerce_to_array(values, mask=mask.reshape(1, -1)) + + +def test_coerce_to_array_from_boolean_array(): + # passing BooleanArray to coerce_to_array + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + arr = BooleanArray(values, mask) + result = BooleanArray(*coerce_to_array(arr)) + tm.assert_extension_array_equal(result, arr) + # no copy + assert result._data is arr._data + assert result._mask is arr._mask + + result = BooleanArray(*coerce_to_array(arr), copy=True) + tm.assert_extension_array_equal(result, arr) + assert result._data is not arr._data + assert result._mask is not arr._mask + + with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): + coerce_to_array(arr, mask=mask) + + +def test_coerce_to_numpy_array(): + # with missing values -> object dtype + arr = pd.array([True, False, None], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # also with no missing values -> object dtype + arr = pd.array([True, False, True], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # force bool dtype + result = np.array(arr, dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + # with missing values will raise error + arr = pd.array([True, False, None], dtype="boolean") + msg = ( + "cannot convert to 'bool'-dtype NumPy array with missing values. " + "Specify an appropriate 'na_value' for this dtype." 
+ ) + with pytest.raises(ValueError, match=msg): + np.array(arr, dtype="bool") + + +def test_to_boolean_array_from_strings(): + result = BooleanArray._from_sequence_of_strings( + np.array(["True", "False", np.nan], dtype=object) + ) + expected = BooleanArray( + np.array([True, False, False]), np.array([False, False, True]) + ) + + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_strings_invalid_string(): + with pytest.raises(ValueError, match="cannot be cast"): + BooleanArray._from_sequence_of_strings(["donkey"]) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + # default (with or without missing values) -> object dtype + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype="str") + expected = np.array([True, False, pd.NA], dtype=" can convert to bool, otherwise raises + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): + result = arr.to_numpy(dtype="bool") + + # specify dtype and na_value + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([True, False, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([1, 0, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # converting to int or float without specifying na_value raises + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + arr.to_numpy(dtype="int64") + with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): + arr.to_numpy(dtype="float64") + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool) + result[0] = False + tm.assert_extension_array_equal( + arr, pd.array([False, False, True], dtype="boolean") + ) + + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool, copy=True) + result[0] = False + tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) + + +# FIXME: don't leave commented out +# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion +# manually in the indexing code +# def test_indexing_boolean_mask(): +# arr = pd.array([1, 2, 3, 4], dtype="Int64") +# mask = pd.array([True, False, True, False], dtype="boolean") +# result = arr[mask] +# expected = pd.array([1, 3], dtype="Int64") +# 
tm.assert_extension_array_equal(result, expected) + +# # missing values -> error +# mask = pd.array([True, False, True, None], dtype="boolean") +# with pytest.raises(IndexError): +# result = arr[mask] + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + + # TODO use to_numpy(na_value=None) here + data_object = np.array(data, dtype=object) + data_object[data.isna()] = None + expected = pa.array(data_object, type=pa.bool_(), from_pandas=True) + assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + data = pd.array([True, False, None], dtype="boolean") + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "bool" + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.BooleanDtype) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py new file mode 100644 index 0000000000000..c2987dc37b960 --- /dev/null +++ b/pandas/tests/arrays/boolean/test_function.py @@ -0,0 +1,107 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.fixture +def data(): + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] +) +def test_ufuncs_binary(ufunc): + # two BooleanArrays + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a, a) + expected = pd.array(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s, a) + expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + # Boolean with numpy array + arr = np.array([True, True, False]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a._data, arr), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # BooleanArray with scalar + result = ufunc(a, True) + expected = pd.array(ufunc(a._data, True), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(True, a) + expected = pd.array(ufunc(True, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # not handled types + with pytest.raises(TypeError): + ufunc(a, "test") + + +@pytest.mark.parametrize("ufunc", [np.logical_not]) +def test_ufuncs_unary(ufunc): + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a) + expected = pd.array(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("values", [[True, False], [True, None]]) +def test_ufunc_reduce_raises(values): + a = pd.array(values, dtype="boolean") + with 
pytest.raises(NotImplementedError): + np.add.reduce(a) + + +def test_value_counts_na(): + arr = pd.array([True, False, pd.NA], dtype="boolean") + result = arr.value_counts(dropna=False) + expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + tm.assert_series_equal(result, expected) + + +def test_diff(): + a = pd.array( + [True, True, False, False, True, None, True, None, False], dtype="boolean" + ) + result = pd.core.algorithms.diff(a, 1) + expected = pd.array( + [None, False, True, False, True, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = s.diff() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_indexing.py b/pandas/tests/arrays/boolean/test_indexing.py new file mode 100644 index 0000000000000..6a7daea16963c --- /dev/null +++ b/pandas/tests/arrays/boolean/test_indexing.py @@ -0,0 +1,13 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("na", [None, np.nan, pd.NA]) +def test_setitem_missing_values(na): + arr = pd.array([True, False, None], dtype="boolean") + expected = pd.array([True, None, None], dtype="boolean") + arr[1] = na + tm.assert_extension_array_equal(arr, expected) diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py new file mode 100644 index 0000000000000..6cfe19e2fe3eb --- /dev/null +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -0,0 +1,230 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.tests.extension.base import BaseOpsUtil + + +class TestLogicalOps(BaseOpsUtil): + def test_numpy_scalars_ok(self, all_logical_operators): + a = pd.array([True, False, None], dtype="boolean") + op = getattr(a, all_logical_operators) + + tm.assert_extension_array_equal(op(True), op(np.bool(True))) + tm.assert_extension_array_equal(op(False), op(np.bool(False))) + + def get_op_from_name(self, op_name): + short_opname = op_name.strip("_") + short_opname = short_opname if "xor" in short_opname else short_opname + "_" + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op + + def test_empty_ok(self, all_logical_operators): + a = pd.array([], dtype="boolean") + op_name = all_logical_operators + result = getattr(a, op_name)(True) + tm.assert_extension_array_equal(a, result) + + result = getattr(a, op_name)(False) + tm.assert_extension_array_equal(a, result) + + # TODO: pd.NA + # result = getattr(a, op_name)(pd.NA) + # tm.assert_extension_array_equal(a, result) + + def test_logical_length_mismatch_raises(self, all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Lengths must match to compare" + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)([True, False]) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(np.array([True, False])) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(pd.array([True, False], dtype="boolean")) + + def test_logical_nan_raises(self, 
all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Got float instead" + + with pytest.raises(TypeError, match=msg): + getattr(a, op_name)(np.nan) + + @pytest.mark.parametrize("other", ["a", 1]) + def test_non_bool_or_na_other_raises(self, other, all_logical_operators): + a = pd.array([True, False], dtype="boolean") + with pytest.raises(TypeError, match=str(type(other).__name__)): + getattr(a, all_logical_operators)(other) + + def test_kleene_or(self): + # A clear test of behavior. + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a | b + expected = pd.array( + [True, True, True, True, False, None, True, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [True, None, None]), + (True, [True, True, True]), + (np.bool_(True), [True, True, True]), + (False, [True, False, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_or_scalar(self, other, expected): + # TODO: test True & False + a = pd.array([True, False, None], dtype="boolean") + result = a | other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_and(self): + # A clear test of behavior. 
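The expectations encoded in `test_kleene_or` and `test_kleene_or_scalar` follow three-valued (Kleene) logic: `True | NA` is `True` because no value of the missing operand could change the answer, while `False | NA` stays `NA`. A small sketch, with the results noted as comments:

```python
import pandas as pd

a = pd.array([True, False, None], dtype="boolean")

a | pd.NA    # [True, <NA>, <NA>]
a | False    # [True, False, <NA>]
a | True     # [True, True, True]
```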
+ a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a & b + expected = pd.array( + [True, False, None, False, False, False, None, False, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, False, None]), + (True, [True, False, None]), + (False, [False, False, False]), + (np.bool_(True), [True, False, None]), + (np.bool_(False), [False, False, False]), + ], + ) + def test_kleene_and_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a & other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_xor(self): + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a ^ b + expected = pd.array( + [False, True, None, True, False, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, None, None]), + (True, [False, True, None]), + (np.bool_(True), [False, True, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_xor_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a ^ other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + @pytest.mark.parametrize( + "other", [True, False, pd.NA, [True, False, None] * 3], + ) + def test_no_masked_assumptions(self, other, all_logical_operators): + # The logical operations should not assume that masked values are False! 
+ a = pd.arrays.BooleanArray( + np.array([True, True, True, False, False, False, True, False, True]), + np.array([False] * 6 + [True, True, True]), + ) + b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + if isinstance(other, list): + other = pd.array(other, dtype="boolean") + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) + + if isinstance(other, BooleanArray): + other._data[other._mask] = True + a._data[a._mask] = False + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_ops.py b/pandas/tests/arrays/boolean/test_ops.py new file mode 100644 index 0000000000000..52f602258a049 --- /dev/null +++ b/pandas/tests/arrays/boolean/test_ops.py @@ -0,0 +1,20 @@ +import pandas as pd +import pandas._testing as tm + + +class TestUnaryOps: + def test_invert(self): + a = pd.array([True, False, None], dtype="boolean") + expected = pd.array([False, True, None], dtype="boolean") + tm.assert_extension_array_equal(~a, expected) + + expected = pd.Series(expected, index=["a", "b", "c"], name="name") + result = ~pd.Series(a, index=["a", "b", "c"], name="name") + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"A": a, "B": [True, False, False]}, index=["a", "b", "c"]) + result = ~df + expected = pd.DataFrame( + {"A": expected, "B": [False, True, True]}, index=["a", "b", "c"] + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py new file mode 100644 index 0000000000000..a5c18a25f8e16 --- /dev/null +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -0,0 +1,60 @@ +import numpy as np +import pytest + +import pandas as pd + + +@pytest.fixture +def data(): + return pd.array( + [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], + dtype="boolean", + ) + + +@pytest.mark.parametrize( + "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip", + [ + ([True, pd.NA], True, True, True, pd.NA), + ([False, pd.NA], False, False, pd.NA, False), + ([pd.NA], False, True, pd.NA, pd.NA), + ([], False, True, False, True), + # GH-33253: all True / all False values buggy with skipna=False + ([True, True], True, True, True, True), + ([False, False], False, False, False, False), + ], +) +def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): + # the methods return numpy scalars + exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any) + exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all) + exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip) + exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip) + + for con in [pd.array, pd.Series]: + a = con(values, dtype="boolean") + assert a.any() is exp_any + assert a.all() is exp_all + assert a.any(skipna=False) is exp_any_noskip + assert a.all(skipna=False) is exp_all_noskip + + assert np.any(a.any()) is exp_any + assert np.all(a.all()) is exp_all + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_reductions_return_types(dropna, data, all_numeric_reductions): + op = all_numeric_reductions + s = pd.Series(data) + if dropna: + s = s.dropna() + + if op == "sum": + assert isinstance(getattr(s, op)(), np.int_) + elif op == "prod": + assert isinstance(getattr(s, op)(), np.int_) + elif 
op in ("min", "max"): + assert isinstance(getattr(s, op)(), np.bool_) + else: + # "mean", "std", "var", "median", "kurt", "skew" + assert isinstance(getattr(s, op)(), np.float64) diff --git a/pandas/tests/arrays/boolean/test_repr.py b/pandas/tests/arrays/boolean/test_repr.py new file mode 100644 index 0000000000000..0ee904b18cc9e --- /dev/null +++ b/pandas/tests/arrays/boolean/test_repr.py @@ -0,0 +1,13 @@ +import pandas as pd + + +def test_repr(): + df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) + expected = " A\n0 True\n1 False\n2 <NA>" + assert repr(df) == expected + + expected = "0 True\n1 False\n2 <NA>\nName: A, dtype: boolean" + assert repr(df.A) == expected + + expected = "<BooleanArray>\n[True, False, <NA>]\nLength: 3, dtype: boolean" + assert repr(df.A.array) == expected diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 835aa87a7c21b..325fa476d70e6 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -64,6 +64,8 @@ def test_isin_cats(): [ ("b", "c", ["a", "c"], "Categorical.categories are different"), ("c", "d", ["a", "b"], None), + # https://github.com/pandas-dev/pandas/issues/33288 + ("a", "a", ["a", "b"], None), ("b", None, ["a", None], "Categorical.categories length are different"), ], ) @@ -140,23 +142,21 @@ def test_take_empty(self, allow_fill): with pytest.raises(IndexError, match=msg): cat.take([0], allow_fill=allow_fill) - def test_positional_take(self, ordered_fixture): + def test_positional_take(self, ordered): cat = pd.Categorical( - ["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered_fixture + ["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered ) result = cat.take([0, 1, 2], allow_fill=False) expected = pd.Categorical( - ["a", "a", "b"], categories=cat.categories, ordered=ordered_fixture + ["a", "a", "b"], categories=cat.categories, ordered=ordered ) tm.assert_categorical_equal(result, expected) - def test_positional_take_unobserved(self, ordered_fixture): - cat = pd.Categorical( - ["a", "b"], categories=["a", "b", "c"], ordered=ordered_fixture - ) + def test_positional_take_unobserved(self, ordered): + cat = pd.Categorical(["a", "b"], categories=["a", "b", "c"], ordered=ordered) result = cat.take([1, 0], allow_fill=False) expected = pd.Categorical( - ["b", "a"], categories=cat.categories, ordered=ordered_fixture + ["b", "a"], categories=cat.categories, ordered=ordered ) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 0ff7d3e59abb3..c470f677b5386 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -114,14 +114,14 @@ def test_mode(self, values, categories, exp_mode): exp = Categorical(exp_mode, categories=categories, ordered=True) tm.assert_categorical_equal(res, exp) - def test_searchsorted(self, ordered_fixture): + def test_searchsorted(self, ordered): # https://github.com/pandas-dev/pandas/issues/8420 # https://github.com/pandas-dev/pandas/issues/14522 cat = Categorical( ["cheese", "milk", "apple", "bread", "bread"], categories=["cheese", "milk", "apple", "bread"], - ordered=ordered_fixture, + ordered=ordered, ) ser = Series(cat) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index f49f70f5acf77..691230620c2e8 100644 --- 
a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -5,7 +5,7 @@ from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series import pandas._testing as tm -from pandas.core.arrays.categorical import _recode_for_categories +from pandas.core.arrays.categorical import recode_for_categories from pandas.tests.arrays.categorical.common import TestCategorical @@ -247,7 +247,7 @@ def test_set_categories(self): tm.assert_index_equal(c.categories, Index([1, 2, 3, 4])) exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) - tm.assert_numpy_array_equal(c.to_dense(), exp) + tm.assert_numpy_array_equal(np.asarray(c), exp) # all "pointers" to '4' must be changed from 3 to 0,... c = c.set_categories([4, 3, 2, 1]) @@ -260,7 +260,7 @@ def test_set_categories(self): # output is the same exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) - tm.assert_numpy_array_equal(c.to_dense(), exp) + tm.assert_numpy_array_equal(np.asarray(c), exp) assert c.min() == 4 assert c.max() == 1 @@ -268,13 +268,19 @@ def test_set_categories(self): c2 = c.set_categories([4, 3, 2, 1], ordered=False) assert not c2.ordered - tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) + tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2)) # set_categories should pass thru the ordering c2 = c.set_ordered(False).set_categories([4, 3, 2, 1]) assert not c2.ordered - tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) + tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2)) + + def test_to_dense_deprecated(self): + cat = Categorical(["a", "b", "c", "a"], ordered=True) + + with tm.assert_produces_warning(FutureWarning): + cat.to_dense() @pytest.mark.parametrize( "values, categories, new_categories", @@ -498,7 +504,7 @@ def test_recode_to_categories(self, codes, old, new, expected): expected = np.asanyarray(expected, dtype=np.int8) old = Index(old) new = Index(new) - result = _recode_for_categories(codes, old, new) + result = recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected) def test_recode_to_categories_large(self): @@ -507,5 +513,5 @@ def test_recode_to_categories_large(self): old = Index(codes) expected = np.arange(N - 1, -1, -1, dtype=np.int16) new = Index(expected) - result = _recode_for_categories(codes, old, new) + result = recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index c6b4c4904735c..3e31c1acbe09d 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -252,7 +252,7 @@ def test_constructor_not_sequence(self): def test_constructor_with_null(self): # Cannot have NaN in categories - msg = "Categorial categories cannot be null" + msg = "Categorical categories cannot be null" with pytest.raises(ValueError, match=msg): Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) @@ -500,7 +500,7 @@ def test_from_codes_non_unique_categories(self): Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) def test_from_codes_nan_cat_included(self): - with pytest.raises(ValueError, match="Categorial categories cannot be null"): + with pytest.raises(ValueError, match="Categorical categories cannot be null"): Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) def test_from_codes_too_negative(self): diff --git a/pandas/tests/arrays/categorical/test_indexing.py 
b/pandas/tests/arrays/categorical/test_indexing.py index 3d9469c252914..abfae189bb4d7 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -85,6 +85,15 @@ def test_setitem_same_ordered_rasies(self, other): class TestCategoricalIndexing: + def test_getitem_slice(self): + cat = Categorical(["a", "b", "c", "d", "a", "b", "c"]) + sliced = cat[3] + assert sliced == "d" + + sliced = cat[3:5] + expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"]) + tm.assert_categorical_equal(sliced, expected) + def test_getitem_listlike(self): # GH 9469 @@ -146,7 +155,7 @@ def test_periodindex(self): tm.assert_numpy_array_equal(cat3._codes, exp_arr) tm.assert_index_equal(cat3.categories, exp_idx) - def test_categories_assigments(self): + def test_categories_assignments(self): s = Categorical(["a", "b", "c", "a"]) exp = np.array([1, 2, 3, 1], dtype=np.int64) s.categories = [1, 2, 3] @@ -154,7 +163,7 @@ def test_categories_assigments(self): tm.assert_index_equal(s.categories, Index([1, 2, 3])) @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]]) - def test_categories_assigments_wrong_length_raises(self, new_categories): + def test_categories_assignments_wrong_length_raises(self, new_categories): cat = Categorical(["a", "b", "c", "a"]) msg = ( "new categories need to have the same number of items " diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 8889f45a84237..9eb3c8b3a8c48 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -82,3 +82,18 @@ def test_fillna_iterable_category(self, named): expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)]) tm.assert_categorical_equal(result, expected) + + def test_fillna_array(self): + # accept Categorical or ndarray value if it holds appropriate values + cat = Categorical(["A", "B", "C", None, None]) + + other = cat.fillna("C") + result = cat.fillna(other) + tm.assert_categorical_equal(result, other) + assert isna(cat[-1]) # didnt modify original inplace + + other = np.array(["A", "B", "C", "B", "A"]) + result = cat.fillna(other) + expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype) + tm.assert_categorical_equal(result, expected) + assert isna(cat[-1]) # didnt modify original inplace diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 52530123bd52f..b9ac3ce9a37ae 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import pandas as pd @@ -5,44 +6,46 @@ @pytest.mark.parametrize( - "to_replace,value,expected,check_types,check_categorical", + "to_replace,value,expected,flip_categories", [ # one-to-one - (1, 2, [2, 2, 3], True, True), - (1, 4, [4, 2, 3], True, True), - (4, 1, [1, 2, 3], True, True), - (5, 6, [1, 2, 3], True, True), + (1, 2, [2, 2, 3], False), + (1, 4, [4, 2, 3], False), + (4, 1, [1, 2, 3], False), + (5, 6, [1, 2, 3], False), # many-to-one - ([1], 2, [2, 2, 3], True, True), - ([1, 2], 3, [3, 3, 3], True, True), - ([1, 2], 4, [4, 4, 3], True, True), - ((1, 2, 4), 5, [5, 5, 3], True, True), - ((5, 6), 2, [1, 2, 3], True, True), + ([1], 2, [2, 2, 3], False), + ([1, 2], 3, [3, 3, 3], False), + ([1, 2], 4, [4, 4, 3], False), + ((1, 2, 4), 5, [5, 5, 3], False), + ((5, 6), 2, [1, 2, 3], False), # many-to-many, handled outside of Categorical and 
results in separate dtype - ([1], [2], [2, 2, 3], False, False), - ([1, 4], [5, 2], [5, 2, 3], False, False), + ([1], [2], [2, 2, 3], True), + ([1, 4], [5, 2], [5, 2, 3], True), # check_categorical sorts categories, which crashes on mixed dtypes - (3, "4", [1, 2, "4"], True, False), - ([1, 2, "3"], "5", ["5", "5", 3], True, False), + (3, "4", [1, 2, "4"], False), + ([1, 2, "3"], "5", ["5", "5", 3], True), ], ) -def test_replace(to_replace, value, expected, check_types, check_categorical): +def test_replace(to_replace, value, expected, flip_categories): # GH 31720 + stays_categorical = not isinstance(value, list) + s = pd.Series([1, 2, 3], dtype="category") result = s.replace(to_replace, value) expected = pd.Series(expected, dtype="category") s.replace(to_replace, value, inplace=True) + + if flip_categories: + expected = expected.cat.set_categories(expected.cat.categories[::-1]) + + if not stays_categorical: + # the replace call loses categorical dtype + expected = pd.Series(np.asarray(expected)) + tm.assert_series_equal( - expected, - result, - check_dtype=check_types, - check_categorical=check_categorical, - check_category_order=False, + expected, result, check_category_order=False, ) tm.assert_series_equal( - expected, - s, - check_dtype=check_types, - check_categorical=check_categorical, - check_category_order=False, + expected, s, check_category_order=False, ) diff --git a/pandas/tests/arrays/integer/__init__.py b/pandas/tests/arrays/integer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/arrays/integer/conftest.py b/pandas/tests/arrays/integer/conftest.py new file mode 100644 index 0000000000000..994fccf837f08 --- /dev/null +++ b/pandas/tests/arrays/integer/conftest.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest + +from pandas.core.arrays import integer_array +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) + + +@pytest.fixture( + params=[ + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ] +) +def dtype(request): + return request.param() + + +@pytest.fixture +def data(dtype): + return integer_array( + list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100], + dtype=dtype, + ) + + +@pytest.fixture +def data_missing(dtype): + return integer_array([np.nan, 1], dtype=dtype) + + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py new file mode 100644 index 0000000000000..18f1dac3c13b2 --- /dev/null +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -0,0 +1,348 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.api.types import is_float, is_float_dtype, is_scalar +from pandas.core.arrays import IntegerArray, integer_array +from pandas.tests.extension.base import BaseOpsUtil + + +class TestArithmeticOps(BaseOpsUtil): + def _check_divmod_op(self, s, op, other, exc=None): + super()._check_divmod_op(s, op, other, None) + + def _check_op(self, s, op_name, other, exc=None): + op = self.get_op_from_name(op_name) + result = op(s, other) + + # compute expected + mask = s.isna() + + # if s 
is a DataFrame, squeeze to a Series + # for comparison + if isinstance(s, pd.DataFrame): + result = result.squeeze() + s = s.squeeze() + mask = mask.squeeze() + + # other array is an Integer + if isinstance(other, IntegerArray): + omask = getattr(other, "mask", None) + mask = getattr(other, "data", other) + if omask is not None: + mask |= omask + + # 1 ** na is na, so need to unmask those + if op_name == "__pow__": + mask = np.where(~s.isna() & (s == 1), False, mask) + + elif op_name == "__rpow__": + other_is_one = other == 1 + if isinstance(other_is_one, pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) + + # float result type or float op + if ( + is_float_dtype(other) + or is_float(other) + or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"] + ): + rs = s.astype("float") + expected = op(rs, other) + self._check_op_float(result, expected, mask, s, op_name, other) + + # integer result type + else: + rs = pd.Series(s.values._data, name=s.name) + expected = op(rs, other) + self._check_op_integer(result, expected, mask, s, op_name, other) + + def _check_op_float(self, result, expected, mask, s, op_name, other): + # check comparisons that are resulting in float dtypes + + expected[mask] = np.nan + if "floordiv" in op_name: + # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) + mask2 = np.isinf(expected) & np.isnan(result) + expected[mask2] = np.nan + tm.assert_series_equal(result, expected) + + def _check_op_integer(self, result, expected, mask, s, op_name, other): + # check comparisons that are resulting in integer dtypes + + # to compare properly, we convert the expected + # to float, mask to nans and convert infs + # if we have uints then we process as uints + # then convert to float + # and we ultimately want to create a IntArray + # for comparisons + + fill_value = 0 + + # mod/rmod turn floating 0 into NaN while + # integer works as expected (no nan) + if op_name in ["__mod__", "__rmod__"]: + if is_scalar(other): + if other == 0: + expected[s.values == 0] = 0 + else: + expected = expected.fillna(0) + else: + expected[ + (s.values == 0).fillna(False) + & ((expected == 0).fillna(False) | expected.isna()) + ] = 0 + try: + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value + original = expected + expected = expected.astype(s.dtype) + + except ValueError: + + expected = expected.astype(float) + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value + original = expected + expected = expected.astype(s.dtype) + + expected[mask] = pd.NA + + # assert that the expected astype is ok + # (skip for unsigned as they have wrap around) + if not s.dtype.is_unsigned_integer: + original = pd.Series(original) + + # we need to fill with 0's to emulate what an astype('int') does + # (truncation) for certain ops + if op_name in ["__rtruediv__", "__rdiv__"]: + mask |= original.isna() + original = original.fillna(0).astype("int") + + original = original.astype("float") + original[mask] = np.nan + tm.assert_series_equal(original, expected.astype("float")) + + # assert our expected result + tm.assert_series_equal(result, expected) + + def test_arith_integer_array(self, data, all_arithmetic_operators): + # we operate with a rhs of an integer array + + op = all_arithmetic_operators + + s = pd.Series(data) + rhs = pd.Series([1] * len(data), dtype=data.dtype) + rhs.iloc[-1] = np.nan + + self._check_op(s, op, rhs) + + def test_arith_series_with_scalar(self, 
data, all_arithmetic_operators): + # scalar + op = all_arithmetic_operators + s = pd.Series(data) + self._check_op(s, op, 1, exc=TypeError) + + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): + # frame & scalar + op = all_arithmetic_operators + df = pd.DataFrame({"A": data}) + self._check_op(df, op, 1, exc=TypeError) + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + # ndarray & other series + op = all_arithmetic_operators + s = pd.Series(data) + other = np.ones(len(s), dtype=s.dtype.type) + self._check_op(s, op, other, exc=TypeError) + + def test_arith_coerce_scalar(self, data, all_arithmetic_operators): + + op = all_arithmetic_operators + s = pd.Series(data) + + other = 0.01 + self._check_op(s, op, other) + + @pytest.mark.parametrize("other", [1.0, np.array(1.0)]) + def test_arithmetic_conversion(self, all_arithmetic_operators, other): + # if we have a float operand we should have a float result + # if that is equal to an integer + op = self.get_op_from_name(all_arithmetic_operators) + + s = pd.Series([1, 2, 3], dtype="Int64") + result = op(s, other) + assert result.dtype is np.dtype("float") + + def test_arith_len_mismatch(self, all_arithmetic_operators): + # operating with a list-like with non-matching length raises + op = self.get_op_from_name(all_arithmetic_operators) + other = np.array([1.0]) + + s = pd.Series([1, 2, 3], dtype="Int64") + with pytest.raises(ValueError, match="Lengths must match"): + op(s, other) + + @pytest.mark.parametrize("other", [0, 0.5]) + def test_arith_zero_dim_ndarray(self, other): + arr = integer_array([1, None, 2]) + result = arr + np.array(other) + expected = arr + other + tm.assert_equal(result, expected) + + def test_error(self, data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + opa = getattr(data, op) + + # invalid scalars + msg = ( + r"(:?can only perform ops with numeric values)" + r"|(:?IntegerArray cannot perform the operation mod)" + ) + with pytest.raises(TypeError, match=msg): + ops("foo") + with pytest.raises(TypeError, match=msg): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + with pytest.raises(TypeError, match=msg): + ops(pd.Series("foo", index=s.index)) + + if op != "__rpow__": + # TODO(extension) + # rpow with a datetimelike coerces the integer array incorrectly + msg = ( + "can only perform ops with numeric values|" + "cannot perform .* with this index type: DatetimeArray|" + "Addition/subtraction of integers and integer-arrays " + "with DatetimeArray is no longer supported. 
*" + ) + with pytest.raises(TypeError, match=msg): + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) + + # 2d + result = opa(pd.DataFrame({"A": s})) + assert result is NotImplemented + + msg = r"can only perform ops with 1-d structures" + with pytest.raises(NotImplementedError, match=msg): + opa(np.arange(len(s)).reshape(-1, len(s))) + + @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) + def test_divide_by_zero(self, zero, negative): + # https://github.com/pandas-dev/pandas/issues/27398 + a = pd.array([0, 1, -1, None], dtype="Int64") + result = a / zero + expected = np.array([np.nan, np.inf, -np.inf, np.nan]) + if negative: + expected *= -1 + tm.assert_numpy_array_equal(result, expected) + + def test_pow_scalar(self): + a = pd.array([-1, 0, 1, None, 2], dtype="Int64") + result = a ** 0 + expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** 1 + expected = pd.array([-1, 0, 1, None, 2], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** pd.NA + expected = pd.array([None, None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** np.nan + expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # reversed + a = a[1:] # Can't raise integers to negative powers. + + result = 0 ** a + expected = pd.array([1, 0, None, 0], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = 1 ** a + expected = pd.array([1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = np.nan ** a + expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + def test_pow_array(self): + a = integer_array([0, 0, 0, 1, 1, 1, None, None, None]) + b = integer_array([0, 1, None, 0, 1, None, 0, 1, None]) + result = a ** b + expected = integer_array([1, 0, None, 1, 1, 1, 1, None, None]) + tm.assert_extension_array_equal(result, expected) + + def test_rpow_one_to_na(self): + # https://github.com/pandas-dev/pandas/issues/22022 + # https://github.com/pandas-dev/pandas/issues/29997 + arr = integer_array([np.nan, np.nan]) + result = np.array([1.0, 2.0]) ** arr + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) + + +def test_cross_type_arithmetic(): + + df = pd.DataFrame( + { + "A": pd.Series([1, 2, np.nan], dtype="Int64"), + "B": pd.Series([1, np.nan, 3], dtype="UInt8"), + "C": [1, 2, 3], + } + ) + + result = df.A + df.C + expected = pd.Series([2, 4, np.nan], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = (df.A + df.C) * 3 == 12 + expected = pd.Series([False, True, None], dtype="boolean") + tm.assert_series_equal(result, expected) + + result = df.A + df.B + expected = pd.Series([2, np.nan, np.nan], dtype="Int64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("op", ["mean"]) +def test_reduce_to_float(op): + # some reduce ops always return float, even if the result + # is a rounded number + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": integer_array([1, None, 3], dtype="Int64"), + } + ) + + # op + result = getattr(df.C, op)() + assert isinstance(result, 
float) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_comparison.py b/pandas/tests/arrays/integer/test_comparison.py new file mode 100644 index 0000000000000..d76ed2c21ca0e --- /dev/null +++ b/pandas/tests/arrays/integer/test_comparison.py @@ -0,0 +1,106 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension.base import BaseOpsUtil + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + + # fill the nan locations + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = op(pd.Series(data._data), other) + + # fill the nan locations + expected[data._mask] = pd.NA + expected = expected.astype("boolean") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([1, 0, None], dtype="Int64") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, 2, None, None, None], dtype="Int64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask + + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Int64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") + ) + + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64") + other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + + def test_no_shared_mask(self, data): + result = data + 1 + assert np.shares_memory(result._mask, data._mask) is False + + def test_compare_to_string(self, any_nullable_int_dtype): + # GH 28930 + s = pd.Series([1, None], dtype=any_nullable_int_dtype) + result = s == "a" + expected = pd.Series([False, pd.NA], dtype="boolean") + + self.assert_series_equal(result, expected) + + def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators): + # GH 28930 + s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype) + s2 = pd.Series([1, None, 3], dtype="float") + + method = getattr(s1, 
all_compare_operators) + result = method(2) + + method = getattr(s2, all_compare_operators) + expected = method(2).astype("boolean") + expected[s2.isna()] = pd.NA + + self.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py new file mode 100644 index 0000000000000..43936d8b95bd6 --- /dev/null +++ b/pandas/tests/arrays/integer/test_construction.py @@ -0,0 +1,238 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.api.types import is_integer +from pandas.core.arrays import IntegerArray, integer_array +from pandas.core.arrays.integer import Int8Dtype, Int32Dtype, Int64Dtype + + +def test_uses_pandas_na(): + a = pd.array([1, None], dtype=pd.Int64Dtype()) + assert a[1] is pd.NA + + +def test_from_dtype_from_float(data): + # construct from our dtype & string dtype + dtype = data.dtype + + # from float + expected = pd.Series(data) + result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from int / list + expected = pd.Series(data) + result = pd.Series(np.array(data).tolist(), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from int / array + expected = pd.Series(data).dropna().reset_index(drop=True) + dropped = np.array(data.dropna()).astype(np.dtype((dtype.type))) + result = pd.Series(dropped, dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + +def test_conversions(data_missing): + + # astype to object series + df = pd.DataFrame({"A": data_missing}) + result = df["A"].astype("object") + expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A") + tm.assert_series_equal(result, expected) + + # convert to object ndarray + # we assert that we are exactly equal + # including type conversions of scalars + result = df["A"].astype("object").values + expected = np.array([pd.NA, 1], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + for r, e in zip(result, expected): + if pd.isnull(r): + assert pd.isnull(e) + elif is_integer(r): + assert r == e + assert is_integer(e) + else: + assert r == e + assert type(r) == type(e) + + +def test_integer_array_constructor(): + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") + + result = IntegerArray(values, mask) + expected = integer_array([1, 2, 3, np.nan], dtype="int64") + tm.assert_extension_array_equal(result, expected) + + msg = r".* should be .* numpy array. 
Use the 'pd.array' function instead" + with pytest.raises(TypeError, match=msg): + IntegerArray(values.tolist(), mask) + + with pytest.raises(TypeError, match=msg): + IntegerArray(values, mask.tolist()) + + with pytest.raises(TypeError, match=msg): + IntegerArray(values.astype(float), mask) + msg = r"__init__\(\) missing 1 required positional argument: 'mask'" + with pytest.raises(TypeError, match=msg): + IntegerArray(values) + + +@pytest.mark.parametrize( + "a, b", + [ + ([1, None], [1, np.nan]), + ([None], [np.nan]), + ([None, np.nan], [np.nan, np.nan]), + ([np.nan, np.nan], [np.nan, np.nan]), + ], +) +def test_integer_array_constructor_none_is_nan(a, b): + result = integer_array(a) + expected = integer_array(b) + tm.assert_extension_array_equal(result, expected) + + +def test_integer_array_constructor_copy(): + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") + + result = IntegerArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = IntegerArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + "foo", + 1, + 1.0, + pd.date_range("20130101", periods=2), + np.array(["foo"]), + [[1, 2], [3, 4]], + [np.nan, {"a": 1}], + ], +) +def test_to_integer_array_error(values): + # error in converting existing arrays to IntegerArrays + msg = ( + r"(:?.* cannot be converted to an IntegerDtype)" + r"|(:?values must be a 1D list-like)" + ) + with pytest.raises(TypeError, match=msg): + integer_array(values) + + +def test_to_integer_array_inferred_dtype(): + # if values has dtype -> respect it + result = integer_array(np.array([1, 2], dtype="int8")) + assert result.dtype == Int8Dtype() + result = integer_array(np.array([1, 2], dtype="int32")) + assert result.dtype == Int32Dtype() + + # if values have no dtype -> always int64 + result = integer_array([1, 2]) + assert result.dtype == Int64Dtype() + + +def test_to_integer_array_dtype_keyword(): + result = integer_array([1, 2], dtype="int8") + assert result.dtype == Int8Dtype() + + # if values has dtype -> override it + result = integer_array(np.array([1, 2], dtype="int8"), dtype="int32") + assert result.dtype == Int32Dtype() + + +def test_to_integer_array_float(): + result = integer_array([1.0, 2.0]) + expected = integer_array([1, 2]) + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): + integer_array([1.5, 2.0]) + + # for float dtypes, the itemsize is not preserved + result = integer_array(np.array([1.0, 2.0], dtype="float32")) + assert result.dtype == Int64Dtype() + + +@pytest.mark.parametrize( + "bool_values, int_values, target_dtype, expected_dtype", + [ + ([False, True], [0, 1], Int64Dtype(), Int64Dtype()), + ([False, True], [0, 1], "Int64", Int64Dtype()), + ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()), + ], +) +def test_to_integer_array_bool(bool_values, int_values, target_dtype, expected_dtype): + result = integer_array(bool_values, dtype=target_dtype) + assert result.dtype == expected_dtype + expected = integer_array(int_values, dtype=target_dtype) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values, to_dtype, result_dtype", + [ + (np.array([1], dtype="int64"), None, Int64Dtype), + (np.array([1, np.nan]), None, Int64Dtype), + (np.array([1, np.nan]), "int8", Int8Dtype), + ], +) +def 
test_to_integer_array(values, to_dtype, result_dtype): + # convert existing arrays to IntegerArrays + result = integer_array(values, dtype=to_dtype) + assert result.dtype == result_dtype() + expected = integer_array(values, dtype=result_dtype()) + tm.assert_extension_array_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + expected = np.array(data, dtype=object) + expected[data.isna()] = None + expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) + assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.16.0") +def test_arrow_roundtrip(data): + # roundtrip possible from arrow 0.16.0 + import pyarrow as pa + + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + result = table.to_pandas() + tm.assert_frame_equal(result, df) + + +@td.skip_if_no("pyarrow", min_version="0.16.0") +def test_arrow_from_arrow_uint(): + # https://github.com/pandas-dev/pandas/issues/31896 + # possible mismatch in types + import pyarrow as pa + + dtype = pd.UInt32Dtype() + result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64")) + expected = pd.array([1, 2, 3, 4, None], dtype="UInt32") + + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py new file mode 100644 index 0000000000000..a02501e2dcbf2 --- /dev/null +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -0,0 +1,251 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.generic import ABCIndexClass + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import integer_array +from pandas.core.arrays.integer import Int8Dtype, UInt32Dtype + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + if dtype.is_signed_integer: + assert np.dtype(dtype.type).kind == "i" + else: + assert np.dtype(dtype.type).kind == "u" + assert dtype.name is not None + + +@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) +def test_preserve_dtypes(op): + # TODO(#22346): preserve Int64 dtype + # for ops that enable (mean would actually work here + # but generally it is a float return value) + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": integer_array([1, None, 3], dtype="Int64"), + } + ) + + # op + result = getattr(df.C, op)() + if op in {"sum", "prod", "min", "max"}: + assert isinstance(result, np.int64) + else: + assert isinstance(result, int) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def test_astype_nansafe(): + # see gh-22343 + arr = integer_array([np.nan, 1, 2], dtype="Int8") + msg = "cannot convert to 'uint32'-dtype NumPy array with missing values." 
+ + with pytest.raises(ValueError, match=msg): + arr.astype("uint32") + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_construct_index(all_data, dropna): + # ensure that we do not coerce to Float64Index, rather + # keep as Index + + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Index(integer_array(other, dtype=all_data.dtype)) + expected = pd.Index(other, dtype=object) + + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_astype_index(all_data, dropna): + # as an int/uint index to Index + + all_data = all_data[:10] + if dropna: + other = all_data[~all_data.isna()] + else: + other = all_data + + dtype = all_data.dtype + idx = pd.Index(np.array(other)) + assert isinstance(idx, ABCIndexClass) + + result = idx.astype(dtype) + expected = idx.astype(object).astype(dtype) + tm.assert_index_equal(result, expected) + + +def test_astype(all_data): + all_data = all_data[:10] + + ints = all_data[~all_data.isna()] + mixed = all_data + dtype = Int8Dtype() + + # coerce to same type - ints + s = pd.Series(ints) + result = s.astype(all_data.dtype) + expected = pd.Series(ints) + tm.assert_series_equal(result, expected) + + # coerce to same other - ints + s = pd.Series(ints) + result = s.astype(dtype) + expected = pd.Series(ints, dtype=dtype) + tm.assert_series_equal(result, expected) + + # coerce to same numpy_dtype - ints + s = pd.Series(ints) + result = s.astype(all_data.dtype.numpy_dtype) + expected = pd.Series(ints._data.astype(all_data.dtype.numpy_dtype)) + tm.assert_series_equal(result, expected) + + # coerce to same type - mixed + s = pd.Series(mixed) + result = s.astype(all_data.dtype) + expected = pd.Series(mixed) + tm.assert_series_equal(result, expected) + + # coerce to same other - mixed + s = pd.Series(mixed) + result = s.astype(dtype) + expected = pd.Series(mixed, dtype=dtype) + tm.assert_series_equal(result, expected) + + # coerce to same numpy_dtype - mixed + s = pd.Series(mixed) + msg = r"cannot convert to .*-dtype NumPy array with missing values.*" + with pytest.raises(ValueError, match=msg): + s.astype(all_data.dtype.numpy_dtype) + + # coerce to object + s = pd.Series(mixed) + result = s.astype("object") + expected = pd.Series(np.asarray(mixed)) + tm.assert_series_equal(result, expected) + + +def test_astype_to_larger_numpy(): + a = pd.array([1, 2], dtype="Int32") + result = a.astype("int64") + expected = np.array([1, 2], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + a = pd.array([1, 2], dtype="UInt32") + result = a.astype("uint64") + expected = np.array([1, 2], dtype="uint64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) +def test_astype_specific_casting(dtype): + s = pd.Series([1, 2, 3], dtype="Int64") + result = s.astype(dtype) + expected = pd.Series([1, 2, 3], dtype=dtype) + tm.assert_series_equal(result, expected) + + s = pd.Series([1, 2, 3, None], dtype="Int64") + result = s.astype(dtype) + expected = pd.Series([1, 2, 3, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + +def test_astype_dt64(): + # GH#32435 + arr = pd.array([1, 2, 3, pd.NA]) * 10 ** 9 + + result = arr.astype("datetime64[ns]") + + expected = np.array([1, 2, 3, "NaT"], dtype="M8[s]").astype("M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + +def test_construct_cast_invalid(dtype): + + msg = "cannot safely" + arr = [1.2, 2.3, 3.7] + 
with pytest.raises(TypeError, match=msg): + integer_array(arr, dtype=dtype) + + with pytest.raises(TypeError, match=msg): + pd.Series(arr).astype(dtype) + + arr = [1.2, 2.3, 3.7, np.nan] + with pytest.raises(TypeError, match=msg): + integer_array(arr, dtype=dtype) + + with pytest.raises(TypeError, match=msg): + pd.Series(arr).astype(dtype) + + +@pytest.mark.parametrize("in_series", [True, False]) +def test_to_numpy_na_nan(in_series): + a = pd.array([0, 1, None], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.0, 1.0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="int64", na_value=-1) + expected = np.array([0, 1, -1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="bool", na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("in_series", [True, False]) +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) +def test_to_numpy_dtype(dtype, in_series): + a = pd.array([0, 1], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) +def test_to_numpy_na_raises(dtype): + a = pd.array([0, 1, None], dtype="Int64") + with pytest.raises(ValueError, match=dtype): + a.to_numpy(dtype=dtype) + + +def test_astype_str(): + a = pd.array([1, 2, None], dtype="Int64") + expected = np.array(["1", "2", "<NA>"], dtype=object) + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) + + +def test_astype_boolean(): + # https://github.com/pandas-dev/pandas/issues/31102 + a = pd.array([1, 0, -1, 2, None], dtype="Int64") + result = a.astype("boolean") + expected = pd.array([True, False, True, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py new file mode 100644 index 0000000000000..bdf902d1aca62 --- /dev/null +++ b/pandas/tests/arrays/integer/test_function.py @@ -0,0 +1,120 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import integer_array + + +@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) +# np.sign emits a warning with nans, +@pytest.mark.filterwarnings("ignore:invalid value encountered in sign") +def test_ufuncs_single_int(ufunc): + a = integer_array([1, 2, -3, np.nan]) + result = ufunc(a) + expected = integer_array(ufunc(a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(integer_array(ufunc(a.astype(float)))) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) +def test_ufuncs_single_float(ufunc): + a = integer_array([1, 2, -3, np.nan]) + with np.errstate(invalid="ignore"): + result = ufunc(a) + expected = ufunc(a.astype(float)) + tm.assert_numpy_array_equal(result, expected) + + s = pd.Series(a) + with np.errstate(invalid="ignore"): + result = ufunc(s) + expected = ufunc(s.astype(float)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) +def 
test_ufuncs_binary_int(ufunc): + # two IntegerArrays + a = integer_array([1, 2, -3, np.nan]) + result = ufunc(a, a) + expected = integer_array(ufunc(a.astype(float), a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + # IntegerArray with numpy array + arr = np.array([1, 2, 3, 4]) + result = ufunc(a, arr) + expected = integer_array(ufunc(a.astype(float), arr)) + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = integer_array(ufunc(arr, a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + # IntegerArray with scalar + result = ufunc(a, 1) + expected = integer_array(ufunc(a.astype(float), 1)) + tm.assert_extension_array_equal(result, expected) + + result = ufunc(1, a) + expected = integer_array(ufunc(1, a.astype(float))) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("values", [[0, 1], [0, None]]) +def test_ufunc_reduce_raises(values): + a = integer_array(values) + msg = r"The 'reduce' method is not supported." + with pytest.raises(NotImplementedError, match=msg): + np.add.reduce(a) + + +@pytest.mark.parametrize( + "pandasmethname, kwargs", + [ + ("var", {"ddof": 0}), + ("var", {"ddof": 1}), + ("kurtosis", {}), + ("skew", {}), + ("sem", {}), + ], +) +def test_stat_method(pandasmethname, kwargs): + s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64") + pandasmeth = getattr(s, pandasmethname) + result = pandasmeth(**kwargs) + s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") + pandasmeth = getattr(s2, pandasmethname) + expected = pandasmeth(**kwargs) + assert expected == result + + +def test_value_counts_na(): + arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") + tm.assert_series_equal(result, expected) + + +def test_value_counts_empty(): + # https://github.com/pandas-dev/pandas/issues/33317 + s = pd.Series([], dtype="Int64") + result = s.value_counts() + # TODO: The dtype of the index seems wrong (it's int64 for non-empty) + idx = pd.Index([], dtype="object") + expected = pd.Series([], index=idx, dtype="Int64") + tm.assert_series_equal(result, expected) + + +# TODO(jreback) - these need testing / are broken + +# shift + +# set_index (destroys type) diff --git a/pandas/tests/arrays/integer/test_indexing.py b/pandas/tests/arrays/integer/test_indexing.py new file mode 100644 index 0000000000000..4b953d699108b --- /dev/null +++ b/pandas/tests/arrays/integer/test_indexing.py @@ -0,0 +1,19 @@ +import pandas as pd +import pandas._testing as tm + + +def test_array_setitem_nullable_boolean_mask(): + # GH 31446 + ser = pd.Series([1, 2], dtype="Int64") + result = ser.where(ser > 1) + expected = pd.Series([pd.NA, 2], dtype="Int64") + tm.assert_series_equal(result, expected) + + +def test_array_setitem(): + # GH 31446 + arr = pd.Series([1, 2], dtype="Int64").array + arr[arr > 1] = 1 + + expected = pd.array([1, 1], dtype="Int64") + tm.assert_extension_array_equal(arr, expected) diff --git a/pandas/tests/arrays/integer/test_repr.py b/pandas/tests/arrays/integer/test_repr.py new file mode 100644 index 0000000000000..bdc5724e85e0d --- /dev/null +++ b/pandas/tests/arrays/integer/test_repr.py @@ -0,0 +1,69 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays import 
integer_array +from pandas.core.arrays.integer import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + if dtype.is_signed_integer: + assert np.dtype(dtype.type).kind == "i" + else: + assert np.dtype(dtype.type).kind == "u" + assert dtype.name is not None + + +@pytest.mark.parametrize( + "dtype, expected", + [ + (Int8Dtype(), "Int8Dtype()"), + (Int16Dtype(), "Int16Dtype()"), + (Int32Dtype(), "Int32Dtype()"), + (Int64Dtype(), "Int64Dtype()"), + (UInt8Dtype(), "UInt8Dtype()"), + (UInt16Dtype(), "UInt16Dtype()"), + (UInt32Dtype(), "UInt32Dtype()"), + (UInt64Dtype(), "UInt64Dtype()"), + ], +) +def test_repr_dtype(dtype, expected): + assert repr(dtype) == expected + + +def test_repr_array(): + result = repr(integer_array([1, None, 3])) + expected = "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64" + assert result == expected + + +def test_repr_array_long(): + data = integer_array([1, 2, None] * 1000) + expected = ( + "<IntegerArray>\n" + "[ 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>, 1,\n" + " ...\n" + " <NA>, 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>]\n" + "Length: 3000, dtype: Int64" + ) + result = repr(data) + assert result == expected + + +def test_frame_repr(data_missing): + + df = pd.DataFrame({"A": data_missing}) + result = repr(df) + expected = " A\n0 <NA>\n1 1" + assert result == expected diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 7e7762d8973a0..fef11f0ff3bb2 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -104,6 +104,13 @@ class TestSetitem: def test_set_na(self, left_right_dtypes): left, right = left_right_dtypes result = IntervalArray.from_arrays(left, right) + + if result.dtype.subtype.kind in ["i", "u"]: + msg = "Cannot set float NaN to integer-backed IntervalArray" + with pytest.raises(ValueError, match=msg): + result[0] = np.NaN + return + result[0] = np.nan expected_left = Index([left._na_value] + list(left[1:])) @@ -182,7 +189,7 @@ def test_arrow_array_missing(): import pyarrow as pa from pandas.core.arrays._arrow_utils import ArrowIntervalType - arr = IntervalArray.from_breaks([0, 1, 2, 3]) + arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) arr[1] = None result = pa.array(arr) @@ -209,8 +216,8 @@ def test_arrow_array_missing(): @pyarrow_skip @pytest.mark.parametrize( "breaks", - [[0, 1, 2, 3], pd.date_range("2017", periods=4, freq="D")], - ids=["int", "datetime64[ns]"], + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], ) def test_arrow_table_roundtrip(breaks): import pyarrow as pa diff --git a/pandas/tests/arrays/interval/test_ops.py b/pandas/tests/arrays/interval/test_ops.py index b4de80dc00a4e..9c78c2a48b9ff 100644 --- a/pandas/tests/arrays/interval/test_ops.py +++ b/pandas/tests/arrays/interval/test_ops.py @@ -57,7 +57,7 @@ def test_overlaps_interval_container(self, constructor, other_constructor): # TODO: modify this test when implemented interval_container = constructor.from_breaks(range(5)) other_container = other_constructor.from_breaks(range(5)) - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match="^$"): interval_container.overlaps(other_container) def test_overlaps_na(self, constructor, start_shift): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index baca18239b929..cb3a70e934dcb 100644 --- 
a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -96,6 +96,22 @@ def test_constructor_na_dtype(self, dtype): with pytest.raises(ValueError, match="Cannot convert"): SparseArray([0, 1, np.nan], dtype=dtype) + def test_constructor_warns_when_losing_timezone(self): + # GH#32501 warn when losing timezone inforamtion + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + + expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(dti) + + tm.assert_sp_array_equal(result, expected) + + with tm.assert_produces_warning(UserWarning): + result = SparseArray(pd.Series(dti)) + + tm.assert_sp_array_equal(result, expected) + def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) # XXX: Behavior change: specifying SparseIndex no longer changes the @@ -1102,7 +1118,7 @@ def test_nbytes_block(self): arr = SparseArray([1, 2, 0, 0, 0], kind="block") result = arr.nbytes # (2 * 8) + 4 + 4 - # sp_values, blocs, blenghts + # sp_values, blocs, blengths assert result == 24 def test_asarray_datetime64(self): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5e2f14af341ab..fe770eed84b62 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -214,6 +214,14 @@ def test_from_sequence_no_mutate(copy): tm.assert_numpy_array_equal(a, original) +def test_astype_int(): + arr = pd.array(["1", pd.NA, "3"], dtype="string") + + result = arr.astype("Int64") + expected = pd.array([1, pd.NA, 3], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna): diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index b1b5a9482e34f..ad6e6e4a98057 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -222,6 +222,8 @@ def test_array_copy(): # integer ([1, 2], IntegerArray._from_sequence([1, 2])), ([1, None], IntegerArray._from_sequence([1, None])), + ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA])), + ([1, np.nan], IntegerArray._from_sequence([1, np.nan])), # string (["a", "b"], StringArray._from_sequence(["a", "b"])), (["a", None], StringArray._from_sequence(["a", None])), @@ -291,7 +293,7 @@ class DecimalArray2(DecimalArray): @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): if isinstance(scalars, (pd.Series, pd.Index)): - raise TypeError + raise TypeError("scalars should not be of type pd.Series or pd.Index") return super()._from_sequence(scalars, dtype=dtype, copy=copy) @@ -301,7 +303,9 @@ def test_array_unboxes(index_or_series): data = box([decimal.Decimal("1"), decimal.Decimal("2")]) # make sure it works - with pytest.raises(TypeError): + with pytest.raises( + TypeError, match="scalars should not be of type pd.Series or pd.Index" + ): DecimalArray2._from_sequence(data) result = pd.array(data, dtype="decimal2") diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py deleted file mode 100644 index d14d6f3ff0c41..0000000000000 --- a/pandas/tests/arrays/test_boolean.py +++ /dev/null @@ -1,932 +0,0 @@ -import operator - -import numpy as np -import pytest - -import pandas.util._test_decorators as td - -import pandas as pd -import pandas._testing as tm -from pandas.arrays import 
BooleanArray -from pandas.core.arrays.boolean import coerce_to_array -from pandas.tests.extension.base import BaseOpsUtil - - -def make_data(): - return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] - - -@pytest.fixture -def dtype(): - return pd.BooleanDtype() - - -@pytest.fixture -def data(dtype): - return pd.array(make_data(), dtype=dtype) - - -def test_boolean_array_constructor(): - values = np.array([True, False, True, False], dtype="bool") - mask = np.array([False, False, False, True], dtype="bool") - - result = BooleanArray(values, mask) - expected = pd.array([True, False, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - with pytest.raises(TypeError, match="values should be boolean numpy array"): - BooleanArray(values.tolist(), mask) - - with pytest.raises(TypeError, match="mask should be boolean numpy array"): - BooleanArray(values, mask.tolist()) - - with pytest.raises(TypeError, match="values should be boolean numpy array"): - BooleanArray(values.astype(int), mask) - - with pytest.raises(TypeError, match="mask should be boolean numpy array"): - BooleanArray(values, None) - - with pytest.raises(ValueError, match="values must be a 1D array"): - BooleanArray(values.reshape(1, -1), mask) - - with pytest.raises(ValueError, match="mask must be a 1D array"): - BooleanArray(values, mask.reshape(1, -1)) - - -def test_boolean_array_constructor_copy(): - values = np.array([True, False, True, False], dtype="bool") - mask = np.array([False, False, False, True], dtype="bool") - - result = BooleanArray(values, mask) - assert result._data is values - assert result._mask is mask - - result = BooleanArray(values, mask, copy=True) - assert result._data is not values - assert result._mask is not mask - - -def test_to_boolean_array(): - expected = BooleanArray( - np.array([True, False, True]), np.array([False, False, False]) - ) - - result = pd.array([True, False, True], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([True, False, True]), dtype="boolean") - tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - # with missing values - expected = BooleanArray( - np.array([True, False, True]), np.array([False, False, True]) - ) - - result = pd.array([True, False, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_to_boolean_array_all_none(): - expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True])) - - result = pd.array([None, None, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize( - "a, b", - [ - ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]), - ([True, np.nan], [True, None]), - ([True, pd.NA], [True, None]), - ([np.nan, np.nan], [None, None]), - (np.array([np.nan, np.nan], dtype=float), [None, None]), - ], -) -def test_to_boolean_array_missing_indicators(a, b): - result = pd.array(a, dtype="boolean") - expected = pd.array(b, dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize( - "values", - [ - ["foo", 
"bar"], - ["1", "2"], - # "foo", - [1, 2], - [1.0, 2.0], - pd.date_range("20130101", periods=2), - np.array(["foo"]), - np.array([1, 2]), - np.array([1.0, 2.0]), - [np.nan, {"a": 1}], - ], -) -def test_to_boolean_array_error(values): - # error in converting existing arrays to BooleanArray - msg = "Need to pass bool-like value" - with pytest.raises(TypeError, match=msg): - pd.array(values, dtype="boolean") - - -def test_to_boolean_array_from_integer_array(): - result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") - expected = pd.array([True, False, True, False], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - # with missing values - result = pd.array(np.array([1, 0, 1, None]), dtype="boolean") - expected = pd.array([True, False, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_to_boolean_array_from_float_array(): - result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") - expected = pd.array([True, False, True, False], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - # with missing values - result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") - expected = pd.array([True, False, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_to_boolean_array_integer_like(): - # integers of 0's and 1's - result = pd.array([1, 0, 1, 0], dtype="boolean") - expected = pd.array([True, False, True, False], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - # with missing values - result = pd.array([1, 0, 1, None], dtype="boolean") - expected = pd.array([True, False, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_coerce_to_array(): - # TODO this is currently not public API - values = np.array([True, False, True, False], dtype="bool") - mask = np.array([False, False, False, True], dtype="bool") - result = BooleanArray(*coerce_to_array(values, mask=mask)) - expected = BooleanArray(values, mask) - tm.assert_extension_array_equal(result, expected) - assert result._data is values - assert result._mask is mask - result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) - expected = BooleanArray(values, mask) - tm.assert_extension_array_equal(result, expected) - assert result._data is not values - assert result._mask is not mask - - # mixed missing from values and mask - values = [True, False, None, False] - mask = np.array([False, False, False, True], dtype="bool") - result = BooleanArray(*coerce_to_array(values, mask=mask)) - expected = BooleanArray( - np.array([True, False, True, True]), np.array([False, False, True, True]) - ) - tm.assert_extension_array_equal(result, expected) - result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask)) - tm.assert_extension_array_equal(result, expected) - result = BooleanArray(*coerce_to_array(values, mask=mask.tolist())) - tm.assert_extension_array_equal(result, expected) - - # raise errors for wrong dimension - values = np.array([True, False, True, False], dtype="bool") - mask = np.array([False, False, False, True], dtype="bool") - - with pytest.raises(ValueError, match="values must be a 1D list-like"): - coerce_to_array(values.reshape(1, -1)) - - with pytest.raises(ValueError, match="mask must be a 1D list-like"): - coerce_to_array(values, mask=mask.reshape(1, -1)) - - -def test_coerce_to_array_from_boolean_array(): - # passing BooleanArray to coerce_to_array - values = np.array([True, False, True, 
False], dtype="bool") - mask = np.array([False, False, False, True], dtype="bool") - arr = BooleanArray(values, mask) - result = BooleanArray(*coerce_to_array(arr)) - tm.assert_extension_array_equal(result, arr) - # no copy - assert result._data is arr._data - assert result._mask is arr._mask - - result = BooleanArray(*coerce_to_array(arr), copy=True) - tm.assert_extension_array_equal(result, arr) - assert result._data is not arr._data - assert result._mask is not arr._mask - - with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): - coerce_to_array(arr, mask=mask) - - -def test_coerce_to_numpy_array(): - # with missing values -> object dtype - arr = pd.array([True, False, None], dtype="boolean") - result = np.array(arr) - expected = np.array([True, False, pd.NA], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - # also with no missing values -> object dtype - arr = pd.array([True, False, True], dtype="boolean") - result = np.array(arr) - expected = np.array([True, False, True], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - # force bool dtype - result = np.array(arr, dtype="bool") - expected = np.array([True, False, True], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - # with missing values will raise error - arr = pd.array([True, False, None], dtype="boolean") - with pytest.raises(ValueError): - np.array(arr, dtype="bool") - - -def test_to_boolean_array_from_strings(): - result = BooleanArray._from_sequence_of_strings( - np.array(["True", "False", np.nan], dtype=object) - ) - expected = BooleanArray( - np.array([True, False, False]), np.array([False, False, True]) - ) - - tm.assert_extension_array_equal(result, expected) - - -def test_to_boolean_array_from_strings_invalid_string(): - with pytest.raises(ValueError, match="cannot be cast"): - BooleanArray._from_sequence_of_strings(["donkey"]) - - -def test_repr(): - df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) - expected = " A\n0 True\n1 False\n2 " - assert repr(df) == expected - - expected = "0 True\n1 False\n2 \nName: A, dtype: boolean" - assert repr(df.A) == expected - - expected = "\n[True, False, ]\nLength: 3, dtype: boolean" - assert repr(df.A.array) == expected - - -@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) -def test_to_numpy(box): - con = pd.Series if box else pd.array - # default (with or without missing values) -> object dtype - arr = con([True, False, True], dtype="boolean") - result = arr.to_numpy() - expected = np.array([True, False, True], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - arr = con([True, False, None], dtype="boolean") - result = arr.to_numpy() - expected = np.array([True, False, pd.NA], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - arr = con([True, False, None], dtype="boolean") - result = arr.to_numpy(dtype="str") - expected = np.array([True, False, pd.NA], dtype=" can convert to bool, otherwise raises - arr = con([True, False, True], dtype="boolean") - result = arr.to_numpy(dtype="bool") - expected = np.array([True, False, True], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - - arr = con([True, False, None], dtype="boolean") - with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): - result = arr.to_numpy(dtype="bool") - - # specify dtype and na_value - arr = con([True, False, None], dtype="boolean") - result = arr.to_numpy(dtype=object, na_value=None) - expected = np.array([True, False, None], 
dtype="object") - tm.assert_numpy_array_equal(result, expected) - - result = arr.to_numpy(dtype=bool, na_value=False) - expected = np.array([True, False, False], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - - result = arr.to_numpy(dtype="int64", na_value=-99) - expected = np.array([1, 0, -99], dtype="int64") - tm.assert_numpy_array_equal(result, expected) - - result = arr.to_numpy(dtype="float64", na_value=np.nan) - expected = np.array([1, 0, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - # converting to int or float without specifying na_value raises - with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): - arr.to_numpy(dtype="int64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - arr.to_numpy(dtype="float64") - - -def test_to_numpy_copy(): - # to_numpy can be zero-copy if no missing values - arr = pd.array([True, False, True], dtype="boolean") - result = arr.to_numpy(dtype=bool) - result[0] = False - tm.assert_extension_array_equal( - arr, pd.array([False, False, True], dtype="boolean") - ) - - arr = pd.array([True, False, True], dtype="boolean") - result = arr.to_numpy(dtype=bool, copy=True) - result[0] = False - tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) - - -def test_astype(): - # with missing values - arr = pd.array([True, False, None], dtype="boolean") - - with pytest.raises(ValueError, match="cannot convert NA to integer"): - arr.astype("int64") - - with pytest.raises(ValueError, match="cannot convert float NaN to"): - arr.astype("bool") - - result = arr.astype("float64") - expected = np.array([1, 0, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - result = arr.astype("str") - expected = np.array(["True", "False", ""], dtype="object") - tm.assert_numpy_array_equal(result, expected) - - # no missing values - arr = pd.array([True, False, True], dtype="boolean") - result = arr.astype("int64") - expected = np.array([1, 0, 1], dtype="int64") - tm.assert_numpy_array_equal(result, expected) - - result = arr.astype("bool") - expected = np.array([True, False, True], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - - -def test_astype_to_boolean_array(): - # astype to BooleanArray - arr = pd.array([True, False, None], dtype="boolean") - - result = arr.astype("boolean") - tm.assert_extension_array_equal(result, arr) - result = arr.astype(pd.BooleanDtype()) - tm.assert_extension_array_equal(result, arr) - - -def test_astype_to_integer_array(): - # astype to IntegerArray - arr = pd.array([True, False, None], dtype="boolean") - - result = arr.astype("Int64") - expected = pd.array([1, 0, None], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize("na", [None, np.nan, pd.NA]) -def test_setitem_missing_values(na): - arr = pd.array([True, False, None], dtype="boolean") - expected = pd.array([True, None, None], dtype="boolean") - arr[1] = na - tm.assert_extension_array_equal(arr, expected) - - -@pytest.mark.parametrize( - "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] -) -def test_ufuncs_binary(ufunc): - # two BooleanArrays - a = pd.array([True, False, None], dtype="boolean") - result = ufunc(a, a) - expected = pd.array(ufunc(a._data, a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - s = pd.Series(a) - result = ufunc(s, a) - expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") - 
expected[a._mask] = np.nan - tm.assert_series_equal(result, expected) - - # Boolean with numpy array - arr = np.array([True, True, False]) - result = ufunc(a, arr) - expected = pd.array(ufunc(a._data, arr), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - result = ufunc(arr, a) - expected = pd.array(ufunc(arr, a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - # BooleanArray with scalar - result = ufunc(a, True) - expected = pd.array(ufunc(a._data, True), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - result = ufunc(True, a) - expected = pd.array(ufunc(True, a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - # not handled types - with pytest.raises(TypeError): - ufunc(a, "test") - - -@pytest.mark.parametrize("ufunc", [np.logical_not]) -def test_ufuncs_unary(ufunc): - a = pd.array([True, False, None], dtype="boolean") - result = ufunc(a) - expected = pd.array(ufunc(a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_extension_array_equal(result, expected) - - s = pd.Series(a) - result = ufunc(s) - expected = pd.Series(ufunc(a._data), dtype="boolean") - expected[a._mask] = np.nan - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("values", [[True, False], [True, None]]) -def test_ufunc_reduce_raises(values): - a = pd.array(values, dtype="boolean") - with pytest.raises(NotImplementedError): - np.add.reduce(a) - - -class TestUnaryOps: - def test_invert(self): - a = pd.array([True, False, None], dtype="boolean") - expected = pd.array([False, True, None], dtype="boolean") - tm.assert_extension_array_equal(~a, expected) - - expected = pd.Series(expected, index=["a", "b", "c"], name="name") - result = ~pd.Series(a, index=["a", "b", "c"], name="name") - tm.assert_series_equal(result, expected) - - df = pd.DataFrame({"A": a, "B": [True, False, False]}, index=["a", "b", "c"]) - result = ~df - expected = pd.DataFrame( - {"A": expected, "B": [False, True, True]}, index=["a", "b", "c"] - ) - tm.assert_frame_equal(result, expected) - - -class TestLogicalOps(BaseOpsUtil): - def test_numpy_scalars_ok(self, all_logical_operators): - a = pd.array([True, False, None], dtype="boolean") - op = getattr(a, all_logical_operators) - - tm.assert_extension_array_equal(op(True), op(np.bool(True))) - tm.assert_extension_array_equal(op(False), op(np.bool(False))) - - def get_op_from_name(self, op_name): - short_opname = op_name.strip("_") - short_opname = short_opname if "xor" in short_opname else short_opname + "_" - try: - op = getattr(operator, short_opname) - except AttributeError: - # Assume it is the reverse operator - rop = getattr(operator, short_opname[1:]) - op = lambda x, y: rop(y, x) - - return op - - def test_empty_ok(self, all_logical_operators): - a = pd.array([], dtype="boolean") - op_name = all_logical_operators - result = getattr(a, op_name)(True) - tm.assert_extension_array_equal(a, result) - - result = getattr(a, op_name)(False) - tm.assert_extension_array_equal(a, result) - - # TODO: pd.NA - # result = getattr(a, op_name)(pd.NA) - # tm.assert_extension_array_equal(a, result) - - def test_logical_length_mismatch_raises(self, all_logical_operators): - op_name = all_logical_operators - a = pd.array([True, False, None], dtype="boolean") - msg = "Lengths must match to compare" - - with pytest.raises(ValueError, match=msg): - getattr(a, 
op_name)([True, False]) - - with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(np.array([True, False])) - - with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(pd.array([True, False], dtype="boolean")) - - def test_logical_nan_raises(self, all_logical_operators): - op_name = all_logical_operators - a = pd.array([True, False, None], dtype="boolean") - msg = "Got float instead" - - with pytest.raises(TypeError, match=msg): - getattr(a, op_name)(np.nan) - - @pytest.mark.parametrize("other", ["a", 1]) - def test_non_bool_or_na_other_raises(self, other, all_logical_operators): - a = pd.array([True, False], dtype="boolean") - with pytest.raises(TypeError, match=str(type(other).__name__)): - getattr(a, all_logical_operators)(other) - - def test_kleene_or(self): - # A clear test of behavior. - a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - b = pd.array([True, False, None] * 3, dtype="boolean") - result = a | b - expected = pd.array( - [True, True, True, True, False, None, True, None, None], dtype="boolean" - ) - tm.assert_extension_array_equal(result, expected) - - result = b | a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - ) - tm.assert_extension_array_equal( - b, pd.array([True, False, None] * 3, dtype="boolean") - ) - - @pytest.mark.parametrize( - "other, expected", - [ - (pd.NA, [True, None, None]), - (True, [True, True, True]), - (np.bool_(True), [True, True, True]), - (False, [True, False, None]), - (np.bool_(False), [True, False, None]), - ], - ) - def test_kleene_or_scalar(self, other, expected): - # TODO: test True & False - a = pd.array([True, False, None], dtype="boolean") - result = a | other - expected = pd.array(expected, dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - result = other | a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True, False, None], dtype="boolean") - ) - - def test_kleene_and(self): - # A clear test of behavior. 
- a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - b = pd.array([True, False, None] * 3, dtype="boolean") - result = a & b - expected = pd.array( - [True, False, None, False, False, False, None, False, None], dtype="boolean" - ) - tm.assert_extension_array_equal(result, expected) - - result = b & a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - ) - tm.assert_extension_array_equal( - b, pd.array([True, False, None] * 3, dtype="boolean") - ) - - @pytest.mark.parametrize( - "other, expected", - [ - (pd.NA, [None, False, None]), - (True, [True, False, None]), - (False, [False, False, False]), - (np.bool_(True), [True, False, None]), - (np.bool_(False), [False, False, False]), - ], - ) - def test_kleene_and_scalar(self, other, expected): - a = pd.array([True, False, None], dtype="boolean") - result = a & other - expected = pd.array(expected, dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - result = other & a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True, False, None], dtype="boolean") - ) - - def test_kleene_xor(self): - a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - b = pd.array([True, False, None] * 3, dtype="boolean") - result = a ^ b - expected = pd.array( - [False, True, None, True, False, None, None, None, None], dtype="boolean" - ) - tm.assert_extension_array_equal(result, expected) - - result = b ^ a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - ) - tm.assert_extension_array_equal( - b, pd.array([True, False, None] * 3, dtype="boolean") - ) - - @pytest.mark.parametrize( - "other, expected", - [ - (pd.NA, [None, None, None]), - (True, [False, True, None]), - (np.bool_(True), [False, True, None]), - (np.bool_(False), [True, False, None]), - ], - ) - def test_kleene_xor_scalar(self, other, expected): - a = pd.array([True, False, None], dtype="boolean") - result = a ^ other - expected = pd.array(expected, dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - result = other ^ a - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - tm.assert_extension_array_equal( - a, pd.array([True, False, None], dtype="boolean") - ) - - @pytest.mark.parametrize( - "other", [True, False, pd.NA, [True, False, None] * 3], - ) - def test_no_masked_assumptions(self, other, all_logical_operators): - # The logical operations should not assume that masked values are False! 
- a = pd.arrays.BooleanArray( - np.array([True, True, True, False, False, False, True, False, True]), - np.array([False] * 6 + [True, True, True]), - ) - b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - if isinstance(other, list): - other = pd.array(other, dtype="boolean") - - result = getattr(a, all_logical_operators)(other) - expected = getattr(b, all_logical_operators)(other) - tm.assert_extension_array_equal(result, expected) - - if isinstance(other, BooleanArray): - other._data[other._mask] = True - a._data[a._mask] = False - - result = getattr(a, all_logical_operators)(other) - expected = getattr(b, all_logical_operators)(other) - tm.assert_extension_array_equal(result, expected) - - -class TestComparisonOps(BaseOpsUtil): - def _compare_other(self, data, op_name, other): - op = self.get_op_from_name(op_name) - - # array - result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other), dtype="boolean") - # propagate NAs - expected[data._mask] = pd.NA - - tm.assert_series_equal(result, expected) - - # series - s = pd.Series(data) - result = op(s, other) - - expected = pd.Series(data._data) - expected = op(expected, other) - expected = expected.astype("boolean") - # propagate NAs - expected[data._mask] = pd.NA - - tm.assert_series_equal(result, expected) - - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - self._compare_other(data, op_name, True) - - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - other = pd.array([True] * len(data), dtype="boolean") - self._compare_other(data, op_name, other) - other = np.array([True] * len(data)) - self._compare_other(data, op_name, other) - other = pd.Series([True] * len(data)) - self._compare_other(data, op_name, other) - - @pytest.mark.parametrize("other", [True, False, pd.NA]) - def test_scalar(self, other, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([True, False, None], dtype="boolean") - - result = op(a, other) - - if other is pd.NA: - expected = pd.array([None, None, None], dtype="boolean") - else: - values = op(a._data, other) - expected = BooleanArray(values, a._mask, copy=True) - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - result[0] = None - tm.assert_extension_array_equal( - a, pd.array([True, False, None], dtype="boolean") - ) - - def test_array(self, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - b = pd.array([True, False, None] * 3, dtype="boolean") - - result = op(a, b) - - values = op(a._data, b._data) - mask = a._mask | b._mask - expected = BooleanArray(values, mask) - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - result[0] = None - tm.assert_extension_array_equal( - a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - ) - tm.assert_extension_array_equal( - b, pd.array([True, False, None] * 3, dtype="boolean") - ) - - -class TestArithmeticOps(BaseOpsUtil): - def test_error(self, data, all_arithmetic_operators): - # invalid ops - - op = all_arithmetic_operators - s = pd.Series(data) - ops = getattr(s, op) - opa = getattr(data, op) - - # invalid scalars - with pytest.raises(TypeError): - ops("foo") - with pytest.raises(TypeError): - ops(pd.Timestamp("20180101")) - - # invalid array-likes - if op not in ("__mul__", "__rmul__"): - # 
TODO(extension) numpy's mul with object array sees booleans as numbers - with pytest.raises(TypeError): - ops(pd.Series("foo", index=s.index)) - - # 2d - result = opa(pd.DataFrame({"A": s})) - assert result is NotImplemented - - with pytest.raises(NotImplementedError): - opa(np.arange(len(s)).reshape(-1, len(s))) - - -@pytest.mark.parametrize("dropna", [True, False]) -def test_reductions_return_types(dropna, data, all_numeric_reductions): - op = all_numeric_reductions - s = pd.Series(data) - if dropna: - s = s.dropna() - - if op in ("sum", "prod"): - assert isinstance(getattr(s, op)(), np.int64) - elif op in ("min", "max"): - assert isinstance(getattr(s, op)(), np.bool_) - else: - # "mean", "std", "var", "median", "kurt", "skew" - assert isinstance(getattr(s, op)(), np.float64) - - -@pytest.mark.parametrize( - "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip", - [ - ([True, pd.NA], True, True, True, pd.NA), - ([False, pd.NA], False, False, pd.NA, False), - ([pd.NA], False, True, pd.NA, pd.NA), - ([], False, True, False, True), - ], -) -def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): - # the methods return numpy scalars - exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any) - exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all) - exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip) - exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip) - - for con in [pd.array, pd.Series]: - a = con(values, dtype="boolean") - assert a.any() is exp_any - assert a.all() is exp_all - assert a.any(skipna=False) is exp_any_noskip - assert a.all(skipna=False) is exp_all_noskip - - assert np.any(a.any()) is exp_any - assert np.all(a.all()) is exp_all - - -# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion -# manually in the indexing code -# def test_indexing_boolean_mask(): -# arr = pd.array([1, 2, 3, 4], dtype="Int64") -# mask = pd.array([True, False, True, False], dtype="boolean") -# result = arr[mask] -# expected = pd.array([1, 3], dtype="Int64") -# tm.assert_extension_array_equal(result, expected) - -# # missing values -> error -# mask = pd.array([True, False, True, None], dtype="boolean") -# with pytest.raises(IndexError): -# result = arr[mask] - - -@td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(data): - # protocol added in 0.15.0 - import pyarrow as pa - - arr = pa.array(data) - - # TODO use to_numpy(na_value=None) here - data_object = np.array(data, dtype=object) - data_object[data.isna()] = None - expected = pa.array(data_object, type=pa.bool_(), from_pandas=True) - assert arr.equals(expected) - - -@td.skip_if_no("pyarrow", min_version="0.15.1.dev") -def test_arrow_roundtrip(): - # roundtrip possible from arrow 1.0.0 - import pyarrow as pa - - data = pd.array([True, False, None], dtype="boolean") - df = pd.DataFrame({"a": data}) - table = pa.table(df) - assert table.field("a").type == "bool" - result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.BooleanDtype) - tm.assert_frame_equal(result, df) - - -def test_value_counts_na(): - arr = pd.array([True, False, pd.NA], dtype="boolean") - result = arr.value_counts(dropna=False) - expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") - tm.assert_series_equal(result, expected) - - result = arr.value_counts(dropna=True) - expected = pd.Series([1, 1], index=[True, False], dtype="Int64") - tm.assert_series_equal(result, expected) - - -def test_diff(): - a = pd.array( - 
[True, True, False, False, True, None, True, None, False], dtype="boolean" - ) - result = pd.core.algorithms.diff(a, 1) - expected = pd.array( - [None, False, True, False, True, None, None, None, None], dtype="boolean" - ) - tm.assert_extension_array_equal(result, expected) - - s = pd.Series(a) - result = s.diff() - expected = pd.Series(expected) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 17818b6ce689f..5b703cfe8fae5 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -10,7 +10,7 @@ import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexes.period import Period, PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex @@ -60,13 +60,19 @@ def timedelta_index(request): class SharedTests: index_cls: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] + @pytest.fixture + def arr1d(self): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + return arr + def test_compare_len1_raises(self): # make sure we raise when comparing with different lengths, specific # to the case where one has length-1, which numpy would broadcast data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.array_cls._simple_new(data, freq="D") - arr = self.index_cls(idx) + arr = self.array_cls._simple_new(data, freq="D") + idx = self.index_cls(arr) with pytest.raises(ValueError, match="Lengths must match"): arr == arr[:1] @@ -94,6 +100,16 @@ def test_take(self): tm.assert_index_equal(self.index_cls(result), expected) + @pytest.mark.parametrize("fill_value", [2, 2.0, pd.Timestamp.now().time]) + def test_take_fill_raises(self, fill_value): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + + arr = self.array_cls._simple_new(data, freq="D") + + msg = f"'fill_value' should be a {self.dtype}. Got '{fill_value}'" + with pytest.raises(ValueError, match=msg): + arr.take([0, 1], allow_fill=True, fill_value=fill_value) + def test_take_fill(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 @@ -108,15 +124,6 @@ def test_take_fill(self): result = arr.take([-1, 1], allow_fill=True, fill_value=pd.NaT) assert result[0] is pd.NaT - with pytest.raises(ValueError): - arr.take([0, 1], allow_fill=True, fill_value=2) - - with pytest.raises(ValueError): - arr.take([0, 1], allow_fill=True, fill_value=2.0) - - with pytest.raises(ValueError): - arr.take([0, 1], allow_fill=True, fill_value=pd.Timestamp.now().time) - def test_concat_same_type(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 @@ -139,7 +146,8 @@ def test_unbox_scalar(self): result = arr._unbox_scalar(pd.NaT) assert isinstance(result, int) - with pytest.raises(ValueError): + msg = f"'value' should be a {self.dtype.__name__}." 
+ with pytest.raises(ValueError, match=msg): arr._unbox_scalar("foo") def test_check_compatible_with(self): @@ -202,6 +210,23 @@ def test_searchsorted(self): result = arr.searchsorted(pd.NaT) assert result == 0 + def test_getitem_2d(self, arr1d): + # 2d slicing on a 1D array + expected = type(arr1d)(arr1d._data[:, np.newaxis], dtype=arr1d.dtype) + result = arr1d[:, np.newaxis] + tm.assert_equal(result, expected) + + # Lookup on a 2D array + arr2d = expected + expected = type(arr2d)(arr2d._data[:3, 0], dtype=arr2d.dtype) + result = arr2d[:3, 0] + tm.assert_equal(result, expected) + + # Scalar lookup + result = arr2d[-1, 0] + expected = arr1d[-1] + assert result == expected + def test_setitem(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = self.array_cls(data, freq="D") @@ -240,10 +265,35 @@ def test_inplace_arithmetic(self): arr -= pd.Timedelta(days=1) tm.assert_equal(arr, expected) + def test_shift_fill_int_deprecated(self): + # GH#31971 + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = arr.shift(1, fill_value=1) + + expected = arr.copy() + if self.array_cls is PeriodArray: + fill_val = PeriodArray._scalar_type._from_ordinal(1, freq=arr.freq) + else: + fill_val = arr._scalar_type(1) + expected[0] = fill_val + expected[1:] = arr[:-1] + tm.assert_equal(result, expected) + class TestDatetimeArray(SharedTests): index_cls = pd.DatetimeIndex array_cls = DatetimeArray + dtype = pd.Timestamp + + @pytest.fixture + def arr1d(self, tz_naive_fixture): + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq="H", tz=tz) + dta = dti._data + return dta def test_round(self, tz_naive_fixture): # GH#24064 @@ -435,23 +485,28 @@ def test_take_fill_valid(self, datetime_index, tz_naive_fixture): result = arr.take([-1, 1], allow_fill=True, fill_value=now) assert result[0] == now - with pytest.raises(ValueError): + msg = f"'fill_value' should be a {self.dtype}. Got '0 days 00:00:00'." + with pytest.raises(ValueError, match=msg): # fill_value Timedelta invalid arr.take([-1, 1], allow_fill=True, fill_value=now - now) - with pytest.raises(ValueError): + msg = f"'fill_value' should be a {self.dtype}. Got '2014Q1'." + with pytest.raises(ValueError, match=msg): # fill_value Period invalid arr.take([-1, 1], allow_fill=True, fill_value=pd.Period("2014Q1")) tz = None if dti.tz is not None else "US/Eastern" now = pd.Timestamp.now().tz_localize(tz) - with pytest.raises(TypeError): + msg = "Cannot compare tz-naive and tz-aware datetime-like objects" + with pytest.raises(TypeError, match=msg): # Timestamp with mismatched tz-awareness arr.take([-1, 1], allow_fill=True, fill_value=now) - with pytest.raises(ValueError): + value = pd.NaT.value + msg = f"'fill_value' should be a {self.dtype}. Got '{value}'." 
+ with pytest.raises(ValueError, match=msg): # require NaT, not iNaT, as it could be confused with an integer - arr.take([-1, 1], allow_fill=True, fill_value=pd.NaT.value) + arr.take([-1, 1], allow_fill=True, fill_value=value) def test_concat_same_type_invalid(self, datetime_index): # different timezones @@ -503,6 +558,7 @@ def test_strftime_nat(self): class TestTimedeltaArray(SharedTests): index_cls = pd.TimedeltaIndex array_cls = TimedeltaArray + dtype = pd.Timedelta def test_from_tdi(self): tdi = pd.TimedeltaIndex(["1 Day", "3 Hours"]) @@ -601,18 +657,27 @@ def test_take_fill_valid(self, timedelta_index): assert result[0] == td1 now = pd.Timestamp.now() - with pytest.raises(ValueError): + value = now + msg = f"'fill_value' should be a {self.dtype}. Got '{value}'." + with pytest.raises(ValueError, match=msg): # fill_value Timestamp invalid - arr.take([0, 1], allow_fill=True, fill_value=now) + arr.take([0, 1], allow_fill=True, fill_value=value) - with pytest.raises(ValueError): + value = now.to_period("D") + msg = f"'fill_value' should be a {self.dtype}. Got '{value}'." + with pytest.raises(ValueError, match=msg): # fill_value Period invalid - arr.take([0, 1], allow_fill=True, fill_value=now.to_period("D")) + arr.take([0, 1], allow_fill=True, fill_value=value) class TestPeriodArray(SharedTests): index_cls = pd.PeriodIndex array_cls = PeriodArray + dtype = pd.Period + + @pytest.fixture + def arr1d(self, period_index): + return period_index._data def test_from_pi(self, period_index): pi = period_index @@ -648,10 +713,11 @@ def test_to_timestamp(self, how, period_index): def test_to_timestamp_out_of_bounds(self): # GH#19643 previously overflowed silently pi = pd.period_range("1500", freq="Y", periods=3) - with pytest.raises(OutOfBoundsDatetime): + msg = "Out of bounds nanosecond timestamp: 1500-01-01 00:00:00" + with pytest.raises(OutOfBoundsDatetime, match=msg): pi.to_timestamp() - with pytest.raises(OutOfBoundsDatetime): + with pytest.raises(OutOfBoundsDatetime, match=msg): pi._data.to_timestamp() @pytest.mark.parametrize("propname", PeriodArray._bool_ops) @@ -687,11 +753,12 @@ def test_array_interface(self, period_index): result = np.asarray(arr, dtype=object) tm.assert_numpy_array_equal(result, expected) - # to other dtypes - with pytest.raises(TypeError): - np.asarray(arr, dtype="int64") + result = np.asarray(arr, dtype="int64") + tm.assert_numpy_array_equal(result, arr.asi8) - with pytest.raises(TypeError): + # to other dtypes + msg = r"float\(\) argument must be a string or a number, not 'Period'" + with pytest.raises(TypeError, match=msg): np.asarray(arr, dtype="float64") result = np.asarray(arr, dtype="S20") @@ -757,8 +824,13 @@ def test_casting_nat_setitem_array(array, casting_nats): ids=lambda x: type(x).__name__, ) def test_invalid_nat_setitem_array(array, non_casting_nats): + msg = ( + "'value' should be a '(Timestamp|Timedelta|Period)', 'NaT', or array of those. " + "Got '(timedelta64|datetime64|int)' instead." 
+ ) + for nat in non_casting_nats: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): array[0] = nat @@ -795,3 +867,48 @@ def test_to_numpy_extra(array): assert result[0] == result[1] tm.assert_equal(array, original) + + +@pytest.mark.parametrize( + "values", + [ + pd.to_datetime(["2020-01-01", "2020-02-01"]), + pd.TimedeltaIndex([1, 2], unit="D"), + pd.PeriodIndex(["2020-01-01", "2020-02-01"], freq="D"), + ], +) +@pytest.mark.parametrize("klass", [list, np.array, pd.array, pd.Series]) +def test_searchsorted_datetimelike_with_listlike(values, klass): + # https://github.com/pandas-dev/pandas/issues/32762 + result = values.searchsorted(klass(values)) + expected = np.array([0, 1], dtype=result.dtype) + + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + pd.to_datetime(["2020-01-01", "2020-02-01"]), + pd.TimedeltaIndex([1, 2], unit="D"), + pd.PeriodIndex(["2020-01-01", "2020-02-01"], freq="D"), + ], +) +@pytest.mark.parametrize( + "arg", [[1, 2], ["a", "b"], [pd.Timestamp("2020-01-01", tz="Europe/London")] * 2] +) +def test_searchsorted_datetimelike_with_listlike_invalid_dtype(values, arg): + # https://github.com/pandas-dev/pandas/issues/32762 + msg = "[Unexpected type|Cannot compare]" + with pytest.raises(TypeError, match=msg): + values.searchsorted(arg) + + +@pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) +def test_period_index_construction_from_strings(klass): + # https://github.com/pandas-dev/pandas/issues/26109 + strings = ["2020Q1", "2020Q2"] * 2 + data = klass(strings) + result = PeriodIndex(data, freq="Q") + expected = PeriodIndex([Period(s) for s in strings]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index a59ed429cc404..7d80ad3d8c6be 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -89,11 +89,26 @@ def test_non_array_raises(self): with pytest.raises(ValueError, match="list"): DatetimeArray([1, 2, 3]) - def test_other_type_raises(self): + def test_bool_dtype_raises(self): + arr = np.array([1, 2, 3], dtype="bool") + with pytest.raises( ValueError, match="The dtype of 'values' is incorrect.*bool" ): - DatetimeArray(np.array([1, 2, 3], dtype="bool")) + DatetimeArray(arr) + + msg = r"dtype bool cannot be converted to datetime64\[ns\]" + with pytest.raises(TypeError, match=msg): + DatetimeArray._from_sequence(arr) + + with pytest.raises(TypeError, match=msg): + sequence_to_dt64ns(arr) + + with pytest.raises(TypeError, match=msg): + pd.DatetimeIndex(arr) + + with pytest.raises(TypeError, match=msg): + pd.to_datetime(arr) def test_incorrect_dtype_raises(self): with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): @@ -151,6 +166,18 @@ def test_astype_to_same(self): result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False) assert result is arr + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "datetime64[ns, UTC]"]) + @pytest.mark.parametrize( + "other", ["datetime64[ns]", "datetime64[ns, UTC]", "datetime64[ns, CET]"] + ) + def test_astype_copies(self, dtype, other): + # https://github.com/pandas-dev/pandas/pull/32490 + s = pd.Series([1, 2], dtype=dtype) + orig = s.copy() + t = s.astype(other) + t[:] = pd.NaT + tm.assert_series_equal(s, orig) + @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): arr 
= DatetimeArray._from_sequence([pd.Timestamp("2000"), pd.Timestamp("2001")])
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
deleted file mode 100644
index 0a5a2362bd290..0000000000000
--- a/pandas/tests/arrays/test_integer.py
+++ /dev/null
@@ -1,1116 +0,0 @@
-import numpy as np
-import pytest
-
-import pandas.util._test_decorators as td
-
-from pandas.core.dtypes.generic import ABCIndexClass
-
-import pandas as pd
-import pandas._testing as tm
-from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar
-from pandas.core.arrays import IntegerArray, integer_array
-from pandas.core.arrays.integer import (
-    Int8Dtype,
-    Int16Dtype,
-    Int32Dtype,
-    Int64Dtype,
-    UInt8Dtype,
-    UInt16Dtype,
-    UInt32Dtype,
-    UInt64Dtype,
-)
-from pandas.tests.extension.base import BaseOpsUtil
-
-
-def make_data():
-    return list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100]
-
-
-@pytest.fixture(
-    params=[
-        Int8Dtype,
-        Int16Dtype,
-        Int32Dtype,
-        Int64Dtype,
-        UInt8Dtype,
-        UInt16Dtype,
-        UInt32Dtype,
-        UInt64Dtype,
-    ]
-)
-def dtype(request):
-    return request.param()
-
-
-@pytest.fixture
-def data(dtype):
-    return integer_array(make_data(), dtype=dtype)
-
-
-@pytest.fixture
-def data_missing(dtype):
-    return integer_array([np.nan, 1], dtype=dtype)
-
-
-@pytest.fixture(params=["data", "data_missing"])
-def all_data(request, data, data_missing):
-    """Parametrized fixture giving 'data' and 'data_missing'"""
-    if request.param == "data":
-        return data
-    elif request.param == "data_missing":
-        return data_missing
-
-
-def test_dtypes(dtype):
-    # smoke tests on auto dtype construction
-
-    if dtype.is_signed_integer:
-        assert np.dtype(dtype.type).kind == "i"
-    else:
-        assert np.dtype(dtype.type).kind == "u"
-    assert dtype.name is not None
-
-
-@pytest.mark.parametrize(
-    "dtype, expected",
-    [
-        (Int8Dtype(), "Int8Dtype()"),
-        (Int16Dtype(), "Int16Dtype()"),
-        (Int32Dtype(), "Int32Dtype()"),
-        (Int64Dtype(), "Int64Dtype()"),
-        (UInt8Dtype(), "UInt8Dtype()"),
-        (UInt16Dtype(), "UInt16Dtype()"),
-        (UInt32Dtype(), "UInt32Dtype()"),
-        (UInt64Dtype(), "UInt64Dtype()"),
-    ],
-)
-def test_repr_dtype(dtype, expected):
-    assert repr(dtype) == expected
-
-
-def test_repr_array():
-    result = repr(integer_array([1, None, 3]))
-    expected = "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64"
-    assert result == expected
-
-
-def test_repr_array_long():
-    data = integer_array([1, 2, None] * 1000)
-    expected = (
-        "<IntegerArray>\n"
-        "[   1,    2, <NA>,    1,    2, <NA>,    1,    2, <NA>,    1,\n"
-        " ...\n"
-        " <NA>,    1,    2, <NA>,    1,    2, <NA>,    1,    2, <NA>]\n"
-        "Length: 3000, dtype: Int64"
-    )
-    result = repr(data)
-    assert result == expected
-
-
-class TestConstructors:
-    def test_uses_pandas_na(self):
-        a = pd.array([1, None], dtype=pd.Int64Dtype())
-        assert a[1] is pd.NA
-
-    def test_from_dtype_from_float(self, data):
-        # construct from our dtype & string dtype
-        dtype = data.dtype
-
-        # from float
-        expected = pd.Series(data)
-        result = pd.Series(
-            data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)
-        )
-        tm.assert_series_equal(result, expected)
-
-        # from int / list
-        expected = pd.Series(data)
-        result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
-        tm.assert_series_equal(result, expected)
-
-        # from int / array
-        expected = pd.Series(data).dropna().reset_index(drop=True)
-        dropped = np.array(data.dropna()).astype(np.dtype((dtype.type)))
-        result = pd.Series(dropped, dtype=str(dtype))
-        tm.assert_series_equal(result, expected)
-
-
-class TestArithmeticOps(BaseOpsUtil):
-    def _check_divmod_op(self, s, op, 
other, exc=None): - super()._check_divmod_op(s, op, other, None) - - def _check_op(self, s, op_name, other, exc=None): - op = self.get_op_from_name(op_name) - result = op(s, other) - - # compute expected - mask = s.isna() - - # if s is a DataFrame, squeeze to a Series - # for comparison - if isinstance(s, pd.DataFrame): - result = result.squeeze() - s = s.squeeze() - mask = mask.squeeze() - - # other array is an Integer - if isinstance(other, IntegerArray): - omask = getattr(other, "mask", None) - mask = getattr(other, "data", other) - if omask is not None: - mask |= omask - - # 1 ** na is na, so need to unmask those - if op_name == "__pow__": - mask = np.where(~s.isna() & (s == 1), False, mask) - - elif op_name == "__rpow__": - other_is_one = other == 1 - if isinstance(other_is_one, pd.Series): - other_is_one = other_is_one.fillna(False) - mask = np.where(other_is_one, False, mask) - - # float result type or float op - if ( - is_float_dtype(other) - or is_float(other) - or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"] - ): - rs = s.astype("float") - expected = op(rs, other) - self._check_op_float(result, expected, mask, s, op_name, other) - - # integer result type - else: - rs = pd.Series(s.values._data, name=s.name) - expected = op(rs, other) - self._check_op_integer(result, expected, mask, s, op_name, other) - - def _check_op_float(self, result, expected, mask, s, op_name, other): - # check comparisons that are resulting in float dtypes - - expected[mask] = np.nan - if "floordiv" in op_name: - # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) - mask2 = np.isinf(expected) & np.isnan(result) - expected[mask2] = np.nan - tm.assert_series_equal(result, expected) - - def _check_op_integer(self, result, expected, mask, s, op_name, other): - # check comparisons that are resulting in integer dtypes - - # to compare properly, we convert the expected - # to float, mask to nans and convert infs - # if we have uints then we process as uints - # then convert to float - # and we ultimately want to create a IntArray - # for comparisons - - fill_value = 0 - - # mod/rmod turn floating 0 into NaN while - # integer works as expected (no nan) - if op_name in ["__mod__", "__rmod__"]: - if is_scalar(other): - if other == 0: - expected[s.values == 0] = 0 - else: - expected = expected.fillna(0) - else: - expected[ - (s.values == 0).fillna(False) - & ((expected == 0).fillna(False) | expected.isna()) - ] = 0 - try: - expected[ - ((expected == np.inf) | (expected == -np.inf)).fillna(False) - ] = fill_value - original = expected - expected = expected.astype(s.dtype) - - except ValueError: - - expected = expected.astype(float) - expected[ - ((expected == np.inf) | (expected == -np.inf)).fillna(False) - ] = fill_value - original = expected - expected = expected.astype(s.dtype) - - expected[mask] = pd.NA - - # assert that the expected astype is ok - # (skip for unsigned as they have wrap around) - if not s.dtype.is_unsigned_integer: - original = pd.Series(original) - - # we need to fill with 0's to emulate what an astype('int') does - # (truncation) for certain ops - if op_name in ["__rtruediv__", "__rdiv__"]: - mask |= original.isna() - original = original.fillna(0).astype("int") - - original = original.astype("float") - original[mask] = np.nan - tm.assert_series_equal(original, expected.astype("float")) - - # assert our expected result - tm.assert_series_equal(result, expected) - - def test_arith_integer_array(self, data, all_arithmetic_operators): - # we operate with a rhs 
of an integer array - - op = all_arithmetic_operators - - s = pd.Series(data) - rhs = pd.Series([1] * len(data), dtype=data.dtype) - rhs.iloc[-1] = np.nan - - self._check_op(s, op, rhs) - - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): - # scalar - op = all_arithmetic_operators - s = pd.Series(data) - self._check_op(s, op, 1, exc=TypeError) - - def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): - # frame & scalar - op = all_arithmetic_operators - df = pd.DataFrame({"A": data}) - self._check_op(df, op, 1, exc=TypeError) - - def test_arith_series_with_array(self, data, all_arithmetic_operators): - # ndarray & other series - op = all_arithmetic_operators - s = pd.Series(data) - other = np.ones(len(s), dtype=s.dtype.type) - self._check_op(s, op, other, exc=TypeError) - - def test_arith_coerce_scalar(self, data, all_arithmetic_operators): - - op = all_arithmetic_operators - s = pd.Series(data) - - other = 0.01 - self._check_op(s, op, other) - - @pytest.mark.parametrize("other", [1.0, np.array(1.0)]) - def test_arithmetic_conversion(self, all_arithmetic_operators, other): - # if we have a float operand we should have a float result - # if that is equal to an integer - op = self.get_op_from_name(all_arithmetic_operators) - - s = pd.Series([1, 2, 3], dtype="Int64") - result = op(s, other) - assert result.dtype is np.dtype("float") - - def test_arith_len_mismatch(self, all_arithmetic_operators): - # operating with a list-like with non-matching length raises - op = self.get_op_from_name(all_arithmetic_operators) - other = np.array([1.0]) - - s = pd.Series([1, 2, 3], dtype="Int64") - with pytest.raises(ValueError, match="Lengths must match"): - op(s, other) - - @pytest.mark.parametrize("other", [0, 0.5]) - def test_arith_zero_dim_ndarray(self, other): - arr = integer_array([1, None, 2]) - result = arr + np.array(other) - expected = arr + other - tm.assert_equal(result, expected) - - def test_error(self, data, all_arithmetic_operators): - # invalid ops - - op = all_arithmetic_operators - s = pd.Series(data) - ops = getattr(s, op) - opa = getattr(data, op) - - # invalid scalars - msg = ( - r"(:?can only perform ops with numeric values)" - r"|(:?IntegerArray cannot perform the operation mod)" - ) - with pytest.raises(TypeError, match=msg): - ops("foo") - with pytest.raises(TypeError, match=msg): - ops(pd.Timestamp("20180101")) - - # invalid array-likes - with pytest.raises(TypeError, match=msg): - ops(pd.Series("foo", index=s.index)) - - if op != "__rpow__": - # TODO(extension) - # rpow with a datetimelike coerces the integer array incorrectly - msg = ( - "can only perform ops with numeric values|" - "cannot perform .* with this index type: DatetimeArray|" - "Addition/subtraction of integers and integer-arrays " - "with DatetimeArray is no longer supported. 
*" - ) - with pytest.raises(TypeError, match=msg): - ops(pd.Series(pd.date_range("20180101", periods=len(s)))) - - # 2d - result = opa(pd.DataFrame({"A": s})) - assert result is NotImplemented - - msg = r"can only perform ops with 1-d structures" - with pytest.raises(NotImplementedError, match=msg): - opa(np.arange(len(s)).reshape(-1, len(s))) - - @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) - def test_divide_by_zero(self, zero, negative): - # https://github.com/pandas-dev/pandas/issues/27398 - a = pd.array([0, 1, -1, None], dtype="Int64") - result = a / zero - expected = np.array([np.nan, np.inf, -np.inf, np.nan]) - if negative: - expected *= -1 - tm.assert_numpy_array_equal(result, expected) - - def test_pow_scalar(self): - a = pd.array([-1, 0, 1, None, 2], dtype="Int64") - result = a ** 0 - expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = a ** 1 - expected = pd.array([-1, 0, 1, None, 2], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = a ** pd.NA - expected = pd.array([None, None, 1, None, None], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = a ** np.nan - expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - # reversed - a = a[1:] # Can't raise integers to negative powers. - - result = 0 ** a - expected = pd.array([1, 0, None, 0], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = 1 ** a - expected = pd.array([1, 1, 1, 1], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = pd.NA ** a - expected = pd.array([1, None, None, None], dtype="Int64") - tm.assert_extension_array_equal(result, expected) - - result = np.nan ** a - expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - def test_pow_array(self): - a = integer_array([0, 0, 0, 1, 1, 1, None, None, None]) - b = integer_array([0, 1, None, 0, 1, None, 0, 1, None]) - result = a ** b - expected = integer_array([1, 0, None, 1, 1, 1, 1, None, None]) - tm.assert_extension_array_equal(result, expected) - - def test_rpow_one_to_na(self): - # https://github.com/pandas-dev/pandas/issues/22022 - # https://github.com/pandas-dev/pandas/issues/29997 - arr = integer_array([np.nan, np.nan]) - result = np.array([1.0, 2.0]) ** arr - expected = np.array([1.0, np.nan]) - tm.assert_numpy_array_equal(result, expected) - - -class TestComparisonOps(BaseOpsUtil): - def _compare_other(self, data, op_name, other): - op = self.get_op_from_name(op_name) - - # array - result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other), dtype="boolean") - - # fill the nan locations - expected[data._mask] = pd.NA - - tm.assert_series_equal(result, expected) - - # series - s = pd.Series(data) - result = op(s, other) - - expected = op(pd.Series(data._data), other) - - # fill the nan locations - expected[data._mask] = pd.NA - expected = expected.astype("boolean") - - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) - def test_scalar(self, other, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([1, 0, None], dtype="Int64") - - result = op(a, other) - - if other is pd.NA: - expected = pd.array([None, None, None], dtype="boolean") - else: - 
values = op(a._data, other) - expected = pd.arrays.BooleanArray(values, a._mask, copy=True) - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - result[0] = pd.NA - tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) - - def test_array(self, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([0, 1, 2, None, None, None], dtype="Int64") - b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") - - result = op(a, b) - values = op(a._data, b._data) - mask = a._mask | b._mask - - expected = pd.arrays.BooleanArray(values, mask) - tm.assert_extension_array_equal(result, expected) - - # ensure we haven't mutated anything inplace - result[0] = pd.NA - tm.assert_extension_array_equal( - a, pd.array([0, 1, 2, None, None, None], dtype="Int64") - ) - tm.assert_extension_array_equal( - b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") - ) - - def test_compare_with_booleanarray(self, all_compare_operators): - op = self.get_op_from_name(all_compare_operators) - a = pd.array([True, False, None] * 3, dtype="boolean") - b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64") - other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") - expected = op(a, other) - result = op(a, b) - tm.assert_extension_array_equal(result, expected) - - def test_no_shared_mask(self, data): - result = data + 1 - assert np.shares_memory(result._mask, data._mask) is False - - def test_compare_to_string(self, any_nullable_int_dtype): - # GH 28930 - s = pd.Series([1, None], dtype=any_nullable_int_dtype) - result = s == "a" - expected = pd.Series([False, pd.NA], dtype="boolean") - - self.assert_series_equal(result, expected) - - def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators): - # GH 28930 - s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype) - s2 = pd.Series([1, None, 3], dtype="float") - - method = getattr(s1, all_compare_operators) - result = method(2) - - method = getattr(s2, all_compare_operators) - expected = method(2).astype("boolean") - expected[s2.isna()] = pd.NA - - self.assert_series_equal(result, expected) - - -class TestCasting: - @pytest.mark.parametrize("dropna", [True, False]) - def test_construct_index(self, all_data, dropna): - # ensure that we do not coerce to Float64Index, rather - # keep as Index - - all_data = all_data[:10] - if dropna: - other = np.array(all_data[~all_data.isna()]) - else: - other = all_data - - result = pd.Index(integer_array(other, dtype=all_data.dtype)) - expected = pd.Index(other, dtype=object) - - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("dropna", [True, False]) - def test_astype_index(self, all_data, dropna): - # as an int/uint index to Index - - all_data = all_data[:10] - if dropna: - other = all_data[~all_data.isna()] - else: - other = all_data - - dtype = all_data.dtype - idx = pd.Index(np.array(other)) - assert isinstance(idx, ABCIndexClass) - - result = idx.astype(dtype) - expected = idx.astype(object).astype(dtype) - tm.assert_index_equal(result, expected) - - def test_astype(self, all_data): - all_data = all_data[:10] - - ints = all_data[~all_data.isna()] - mixed = all_data - dtype = Int8Dtype() - - # coerce to same type - ints - s = pd.Series(ints) - result = s.astype(all_data.dtype) - expected = pd.Series(ints) - tm.assert_series_equal(result, expected) - - # coerce to same other - ints - s = pd.Series(ints) - result = s.astype(dtype) - expected = pd.Series(ints, dtype=dtype) - 
tm.assert_series_equal(result, expected) - - # coerce to same numpy_dtype - ints - s = pd.Series(ints) - result = s.astype(all_data.dtype.numpy_dtype) - expected = pd.Series(ints._data.astype(all_data.dtype.numpy_dtype)) - tm.assert_series_equal(result, expected) - - # coerce to same type - mixed - s = pd.Series(mixed) - result = s.astype(all_data.dtype) - expected = pd.Series(mixed) - tm.assert_series_equal(result, expected) - - # coerce to same other - mixed - s = pd.Series(mixed) - result = s.astype(dtype) - expected = pd.Series(mixed, dtype=dtype) - tm.assert_series_equal(result, expected) - - # coerce to same numpy_dtype - mixed - s = pd.Series(mixed) - msg = r"cannot convert to .*-dtype NumPy array with missing values.*" - with pytest.raises(ValueError, match=msg): - s.astype(all_data.dtype.numpy_dtype) - - # coerce to object - s = pd.Series(mixed) - result = s.astype("object") - expected = pd.Series(np.asarray(mixed)) - tm.assert_series_equal(result, expected) - - def test_astype_to_larger_numpy(self): - a = pd.array([1, 2], dtype="Int32") - result = a.astype("int64") - expected = np.array([1, 2], dtype="int64") - tm.assert_numpy_array_equal(result, expected) - - a = pd.array([1, 2], dtype="UInt32") - result = a.astype("uint64") - expected = np.array([1, 2], dtype="uint64") - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) - def test_astype_specific_casting(self, dtype): - s = pd.Series([1, 2, 3], dtype="Int64") - result = s.astype(dtype) - expected = pd.Series([1, 2, 3], dtype=dtype) - tm.assert_series_equal(result, expected) - - s = pd.Series([1, 2, 3, None], dtype="Int64") - result = s.astype(dtype) - expected = pd.Series([1, 2, 3, None], dtype=dtype) - tm.assert_series_equal(result, expected) - - def test_construct_cast_invalid(self, dtype): - - msg = "cannot safely" - arr = [1.2, 2.3, 3.7] - with pytest.raises(TypeError, match=msg): - integer_array(arr, dtype=dtype) - - with pytest.raises(TypeError, match=msg): - pd.Series(arr).astype(dtype) - - arr = [1.2, 2.3, 3.7, np.nan] - with pytest.raises(TypeError, match=msg): - integer_array(arr, dtype=dtype) - - with pytest.raises(TypeError, match=msg): - pd.Series(arr).astype(dtype) - - @pytest.mark.parametrize("in_series", [True, False]) - def test_to_numpy_na_nan(self, in_series): - a = pd.array([0, 1, None], dtype="Int64") - if in_series: - a = pd.Series(a) - - result = a.to_numpy(dtype="float64", na_value=np.nan) - expected = np.array([0.0, 1.0, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) - - result = a.to_numpy(dtype="int64", na_value=-1) - expected = np.array([0, 1, -1], dtype="int64") - tm.assert_numpy_array_equal(result, expected) - - result = a.to_numpy(dtype="bool", na_value=False) - expected = np.array([False, True, False], dtype="bool") - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("in_series", [True, False]) - @pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) - def test_to_numpy_dtype(self, dtype, in_series): - a = pd.array([0, 1], dtype="Int64") - if in_series: - a = pd.Series(a) - - result = a.to_numpy(dtype=dtype) - expected = np.array([0, 1], dtype=dtype) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) - def test_to_numpy_na_raises(self, dtype): - a = pd.array([0, 1, None], dtype="Int64") - with pytest.raises(ValueError, match=dtype): - a.to_numpy(dtype=dtype) - - def test_astype_str(self): - a = 
pd.array([1, 2, None], dtype="Int64") - expected = np.array(["1", "2", ""], dtype=object) - - tm.assert_numpy_array_equal(a.astype(str), expected) - tm.assert_numpy_array_equal(a.astype("str"), expected) - - def test_astype_boolean(self): - # https://github.com/pandas-dev/pandas/issues/31102 - a = pd.array([1, 0, -1, 2, None], dtype="Int64") - result = a.astype("boolean") - expected = pd.array([True, False, True, True, None], dtype="boolean") - tm.assert_extension_array_equal(result, expected) - - -def test_frame_repr(data_missing): - - df = pd.DataFrame({"A": data_missing}) - result = repr(df) - expected = " A\n0 \n1 1" - assert result == expected - - -def test_conversions(data_missing): - - # astype to object series - df = pd.DataFrame({"A": data_missing}) - result = df["A"].astype("object") - expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A") - tm.assert_series_equal(result, expected) - - # convert to object ndarray - # we assert that we are exactly equal - # including type conversions of scalars - result = df["A"].astype("object").values - expected = np.array([pd.NA, 1], dtype=object) - tm.assert_numpy_array_equal(result, expected) - - for r, e in zip(result, expected): - if pd.isnull(r): - assert pd.isnull(e) - elif is_integer(r): - assert r == e - assert is_integer(e) - else: - assert r == e - assert type(r) == type(e) - - -def test_integer_array_constructor(): - values = np.array([1, 2, 3, 4], dtype="int64") - mask = np.array([False, False, False, True], dtype="bool") - - result = IntegerArray(values, mask) - expected = integer_array([1, 2, 3, np.nan], dtype="int64") - tm.assert_extension_array_equal(result, expected) - - msg = r".* should be .* numpy array. Use the 'integer_array' function instead" - with pytest.raises(TypeError, match=msg): - IntegerArray(values.tolist(), mask) - - with pytest.raises(TypeError, match=msg): - IntegerArray(values, mask.tolist()) - - with pytest.raises(TypeError, match=msg): - IntegerArray(values.astype(float), mask) - msg = r"__init__\(\) missing 1 required positional argument: 'mask'" - with pytest.raises(TypeError, match=msg): - IntegerArray(values) - - -@pytest.mark.parametrize( - "a, b", - [ - ([1, None], [1, np.nan]), - ([None], [np.nan]), - ([None, np.nan], [np.nan, np.nan]), - ([np.nan, np.nan], [np.nan, np.nan]), - ], -) -def test_integer_array_constructor_none_is_nan(a, b): - result = integer_array(a) - expected = integer_array(b) - tm.assert_extension_array_equal(result, expected) - - -def test_integer_array_constructor_copy(): - values = np.array([1, 2, 3, 4], dtype="int64") - mask = np.array([False, False, False, True], dtype="bool") - - result = IntegerArray(values, mask) - assert result._data is values - assert result._mask is mask - - result = IntegerArray(values, mask, copy=True) - assert result._data is not values - assert result._mask is not mask - - -@pytest.mark.parametrize( - "values", - [ - ["foo", "bar"], - ["1", "2"], - "foo", - 1, - 1.0, - pd.date_range("20130101", periods=2), - np.array(["foo"]), - [[1, 2], [3, 4]], - [np.nan, {"a": 1}], - ], -) -def test_to_integer_array_error(values): - # error in converting existing arrays to IntegerArrays - msg = ( - r"(:?.* cannot be converted to an IntegerDtype)" - r"|(:?values must be a 1D list-like)" - ) - with pytest.raises(TypeError, match=msg): - integer_array(values) - - -def test_to_integer_array_inferred_dtype(): - # if values has dtype -> respect it - result = integer_array(np.array([1, 2], dtype="int8")) - assert result.dtype == 
Int8Dtype() - result = integer_array(np.array([1, 2], dtype="int32")) - assert result.dtype == Int32Dtype() - - # if values have no dtype -> always int64 - result = integer_array([1, 2]) - assert result.dtype == Int64Dtype() - - -def test_to_integer_array_dtype_keyword(): - result = integer_array([1, 2], dtype="int8") - assert result.dtype == Int8Dtype() - - # if values has dtype -> override it - result = integer_array(np.array([1, 2], dtype="int8"), dtype="int32") - assert result.dtype == Int32Dtype() - - -def test_to_integer_array_float(): - result = integer_array([1.0, 2.0]) - expected = integer_array([1, 2]) - tm.assert_extension_array_equal(result, expected) - - with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): - integer_array([1.5, 2.0]) - - # for float dtypes, the itemsize is not preserved - result = integer_array(np.array([1.0, 2.0], dtype="float32")) - assert result.dtype == Int64Dtype() - - -@pytest.mark.parametrize( - "bool_values, int_values, target_dtype, expected_dtype", - [ - ([False, True], [0, 1], Int64Dtype(), Int64Dtype()), - ([False, True], [0, 1], "Int64", Int64Dtype()), - ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()), - ], -) -def test_to_integer_array_bool(bool_values, int_values, target_dtype, expected_dtype): - result = integer_array(bool_values, dtype=target_dtype) - assert result.dtype == expected_dtype - expected = integer_array(int_values, dtype=target_dtype) - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize( - "values, to_dtype, result_dtype", - [ - (np.array([1], dtype="int64"), None, Int64Dtype), - (np.array([1, np.nan]), None, Int64Dtype), - (np.array([1, np.nan]), "int8", Int8Dtype), - ], -) -def test_to_integer_array(values, to_dtype, result_dtype): - # convert existing arrays to IntegerArrays - result = integer_array(values, dtype=to_dtype) - assert result.dtype == result_dtype() - expected = integer_array(values, dtype=result_dtype()) - tm.assert_extension_array_equal(result, expected) - - -def test_cross_type_arithmetic(): - - df = pd.DataFrame( - { - "A": pd.Series([1, 2, np.nan], dtype="Int64"), - "B": pd.Series([1, np.nan, 3], dtype="UInt8"), - "C": [1, 2, 3], - } - ) - - result = df.A + df.C - expected = pd.Series([2, 4, np.nan], dtype="Int64") - tm.assert_series_equal(result, expected) - - result = (df.A + df.C) * 3 == 12 - expected = pd.Series([False, True, None], dtype="boolean") - tm.assert_series_equal(result, expected) - - result = df.A + df.B - expected = pd.Series([2, np.nan, np.nan], dtype="Int64") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) -def test_preserve_dtypes(op): - # TODO(#22346): preserve Int64 dtype - # for ops that enable (mean would actually work here - # but generally it is a float return value) - df = pd.DataFrame( - { - "A": ["a", "b", "b"], - "B": [1, None, 3], - "C": integer_array([1, None, 3], dtype="Int64"), - } - ) - - # op - result = getattr(df.C, op)() - assert isinstance(result, int) - - # groupby - result = getattr(df.groupby("A"), op)() - - expected = pd.DataFrame( - {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, - index=pd.Index(["a", "b"], name="A"), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("op", ["mean"]) -def test_reduce_to_float(op): - # some reduce ops always return float, even if the result - # is a rounded number - df = pd.DataFrame( - { - "A": ["a", "b", "b"], - "B": [1, None, 3], - "C": integer_array([1, 
None, 3], dtype="Int64"), - } - ) - - # op - result = getattr(df.C, op)() - assert isinstance(result, float) - - # groupby - result = getattr(df.groupby("A"), op)() - - expected = pd.DataFrame( - {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, - index=pd.Index(["a", "b"], name="A"), - ) - tm.assert_frame_equal(result, expected) - - -def test_astype_nansafe(): - # see gh-22343 - arr = integer_array([np.nan, 1, 2], dtype="Int8") - msg = "cannot convert to 'uint32'-dtype NumPy array with missing values." - - with pytest.raises(ValueError, match=msg): - arr.astype("uint32") - - -@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) -# np.sign emits a warning with nans, -@pytest.mark.filterwarnings("ignore:invalid value encountered in sign") -def test_ufuncs_single_int(ufunc): - a = integer_array([1, 2, -3, np.nan]) - result = ufunc(a) - expected = integer_array(ufunc(a.astype(float))) - tm.assert_extension_array_equal(result, expected) - - s = pd.Series(a) - result = ufunc(s) - expected = pd.Series(integer_array(ufunc(a.astype(float)))) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) -def test_ufuncs_single_float(ufunc): - a = integer_array([1, 2, -3, np.nan]) - with np.errstate(invalid="ignore"): - result = ufunc(a) - expected = ufunc(a.astype(float)) - tm.assert_numpy_array_equal(result, expected) - - s = pd.Series(a) - with np.errstate(invalid="ignore"): - result = ufunc(s) - expected = ufunc(s.astype(float)) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) -def test_ufuncs_binary_int(ufunc): - # two IntegerArrays - a = integer_array([1, 2, -3, np.nan]) - result = ufunc(a, a) - expected = integer_array(ufunc(a.astype(float), a.astype(float))) - tm.assert_extension_array_equal(result, expected) - - # IntegerArray with numpy array - arr = np.array([1, 2, 3, 4]) - result = ufunc(a, arr) - expected = integer_array(ufunc(a.astype(float), arr)) - tm.assert_extension_array_equal(result, expected) - - result = ufunc(arr, a) - expected = integer_array(ufunc(arr, a.astype(float))) - tm.assert_extension_array_equal(result, expected) - - # IntegerArray with scalar - result = ufunc(a, 1) - expected = integer_array(ufunc(a.astype(float), 1)) - tm.assert_extension_array_equal(result, expected) - - result = ufunc(1, a) - expected = integer_array(ufunc(1, a.astype(float))) - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize("values", [[0, 1], [0, None]]) -def test_ufunc_reduce_raises(values): - a = integer_array(values) - msg = r"The 'reduce' method is not supported." 
- with pytest.raises(NotImplementedError, match=msg): - np.add.reduce(a) - - -@td.skip_if_no("pyarrow", min_version="0.15.0") -def test_arrow_array(data): - # protocol added in 0.15.0 - import pyarrow as pa - - arr = pa.array(data) - expected = np.array(data, dtype=object) - expected[data.isna()] = None - expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) - assert arr.equals(expected) - - -@td.skip_if_no("pyarrow", min_version="0.16.0") -def test_arrow_roundtrip(data): - # roundtrip possible from arrow 0.16.0 - import pyarrow as pa - - df = pd.DataFrame({"a": data}) - table = pa.table(df) - assert table.field("a").type == str(data.dtype.numpy_dtype) - result = table.to_pandas() - tm.assert_frame_equal(result, df) - - -@td.skip_if_no("pyarrow", min_version="0.16.0") -def test_arrow_from_arrow_uint(): - # https://github.com/pandas-dev/pandas/issues/31896 - # possible mismatch in types - import pyarrow as pa - - dtype = pd.UInt32Dtype() - result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64")) - expected = pd.array([1, 2, 3, 4, None], dtype="UInt32") - - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize( - "pandasmethname, kwargs", - [ - ("var", {"ddof": 0}), - ("var", {"ddof": 1}), - ("kurtosis", {}), - ("skew", {}), - ("sem", {}), - ], -) -def test_stat_method(pandasmethname, kwargs): - s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64") - pandasmeth = getattr(s, pandasmethname) - result = pandasmeth(**kwargs) - s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") - pandasmeth = getattr(s2, pandasmethname) - expected = pandasmeth(**kwargs) - assert expected == result - - -def test_value_counts_na(): - arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") - result = arr.value_counts(dropna=False) - expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") - tm.assert_series_equal(result, expected) - - result = arr.value_counts(dropna=True) - expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") - tm.assert_series_equal(result, expected) - - -def test_array_setitem_nullable_boolean_mask(): - # GH 31446 - ser = pd.Series([1, 2], dtype="Int64") - result = ser.where(ser > 1) - expected = pd.Series([pd.NA, 2], dtype="Int64") - tm.assert_series_equal(result, expected) - - -def test_array_setitem(): - # GH 31446 - arr = pd.Series([1, 2], dtype="Int64").array - arr[arr > 1] = 1 - - expected = pd.array([1, 1], dtype="Int64") - tm.assert_extension_array_equal(arr, expected) - - -# TODO(jreback) - these need testing / are broken - -# shift - -# set_index (destroys type) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 0b95d3aa19366..d3ced2f1b1f07 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -37,6 +37,7 @@ def test_registered(): ([pd.Period("2017", "D"), None], None, [17167, iNaT]), (pd.Series(pd.date_range("2017", periods=3)), None, [17167, 17168, 17169]), (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]), + (pd.period_range("2017", periods=4, freq="Q"), None, [188, 189, 190, 191]), ], ) def test_period_array_ok(data, freq, expected): diff --git a/pandas/tests/base/common.py b/pandas/tests/base/common.py new file mode 100644 index 0000000000000..b09710a974c2a --- /dev/null +++ b/pandas/tests/base/common.py @@ -0,0 +1,9 @@ +from typing import Any + +from pandas import Index + + +def allow_na_ops(obj: Any) -> bool: + """Whether to skip test cases including NaN""" + is_bool_index = 
isinstance(obj, Index) and obj.is_boolean() + return not is_bool_index and obj._can_hold_na diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 46fd1551e6170..59f9103072fe9 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -220,34 +220,6 @@ def test_values_consistent(array, expected_type, dtype): tm.assert_equal(l_values, r_values) -@pytest.mark.parametrize( - "array, expected", - [ - (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), - (np.array(["0", "1"]), np.array(["0", "1"], dtype=object)), - (pd.Categorical(["a", "a"]), np.array([0, 0], dtype="int8")), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"]), - np.array(["2017-01-01T00:00:00"], dtype="M8[ns]"), - ), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"], tz="US/Eastern"), - np.array(["2017-01-01T05:00:00"], dtype="M8[ns]"), - ), - (pd.TimedeltaIndex([10 ** 10]), np.array([10 ** 10], dtype="m8[ns]")), - ( - pd.PeriodIndex(["2017", "2018"], freq="D"), - np.array([17167, 17532], dtype=np.int64), - ), - ], -) -def test_ndarray_values(array, expected): - l_values = pd.Series(array)._ndarray_values - r_values = pd.Index(array)._ndarray_values - tm.assert_numpy_array_equal(l_values, r_values) - tm.assert_numpy_array_equal(l_values, expected) - - @pytest.mark.parametrize("arr", [np.array([1, 2, 3])]) def test_numpy_array(arr): ser = pd.Series(arr) diff --git a/pandas/tests/base/test_drop_duplicates.py b/pandas/tests/base/test_drop_duplicates.py new file mode 100644 index 0000000000000..4032890b4db18 --- /dev/null +++ b/pandas/tests/base/test_drop_duplicates.py @@ -0,0 +1,30 @@ +from datetime import datetime + +import numpy as np + +import pandas as pd +import pandas._testing as tm + + +def test_drop_duplicates_series_vs_dataframe(): + # GH 14192 + df = pd.DataFrame( + { + "a": [1, 1, 1, "one", "one"], + "b": [2, 2, np.nan, np.nan, np.nan], + "c": [3, 3, np.nan, np.nan, "three"], + "d": [1, 2, 3, 4, 4], + "e": [ + datetime(2015, 1, 1), + datetime(2015, 1, 1), + datetime(2015, 2, 1), + pd.NaT, + pd.NaT, + ], + } + ) + for column in df.columns: + for keep in ["first", "last", False]: + dropped_frame = df[[column]].drop_duplicates(keep=keep) + dropped_series = df[column].drop_duplicates(keep=keep) + tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py new file mode 100644 index 0000000000000..415a8b7e4362f --- /dev/null +++ b/pandas/tests/base/test_factorize.py @@ -0,0 +1,28 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("sort", [True, False]) +def test_factorize(index_or_series_obj, sort): + obj = index_or_series_obj + result_codes, result_uniques = obj.factorize(sort=sort) + + constructor = pd.Index + if isinstance(obj, pd.MultiIndex): + constructor = pd.MultiIndex.from_tuples + expected_uniques = constructor(obj.unique()) + + if sort: + expected_uniques = expected_uniques.sort_values() + + # construct an integer ndarray so that + # `expected_uniques.take(expected_codes)` is equal to `obj` + expected_uniques_list = list(expected_uniques) + expected_codes = [expected_uniques_list.index(val) for val in obj] + expected_codes = np.asarray(expected_codes, dtype=np.intp) + + tm.assert_numpy_array_equal(result_codes, expected_codes) + tm.assert_index_equal(result_uniques, expected_uniques) diff --git a/pandas/tests/base/test_fillna.py b/pandas/tests/base/test_fillna.py new 
file mode 100644 index 0000000000000..5e50a9e2d1c7f --- /dev/null +++ b/pandas/tests/base/test_fillna.py @@ -0,0 +1,70 @@ +""" +Though Index.fillna and Series.fillna has separate impl, +test here to confirm these works as the same +""" + +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT + +from pandas.core.dtypes.common import needs_i8_conversion +from pandas.core.dtypes.generic import ABCMultiIndex + +from pandas import Index +import pandas._testing as tm +from pandas.tests.base.common import allow_na_ops + + +def test_fillna(index_or_series_obj): + # GH 11343 + obj = index_or_series_obj + if isinstance(obj, ABCMultiIndex): + pytest.skip("MultiIndex doesn't support isna") + + # values will not be changed + fill_value = obj.values[0] if len(obj) > 0 else 0 + result = obj.fillna(fill_value) + if isinstance(obj, Index): + tm.assert_index_equal(obj, result) + else: + tm.assert_series_equal(obj, result) + + # check shallow_copied + assert obj is not result + + +@pytest.mark.parametrize("null_obj", [np.nan, None]) +def test_fillna_null(null_obj, index_or_series_obj): + # GH 11343 + obj = index_or_series_obj + klass = type(obj) + + if not allow_na_ops(obj): + pytest.skip(f"{klass} doesn't allow for NA operations") + elif len(obj) < 1: + pytest.skip("Test doesn't make sense on empty data") + elif isinstance(obj, ABCMultiIndex): + pytest.skip(f"MultiIndex can't hold '{null_obj}'") + + values = obj.values + fill_value = values[0] + expected = values.copy() + if needs_i8_conversion(obj): + values[0:2] = iNaT + expected[0:2] = fill_value + else: + values[0:2] = null_obj + expected[0:2] = fill_value + + expected = klass(expected) + obj = klass(values) + + result = obj.fillna(fill_value) + if isinstance(obj, Index): + tm.assert_index_equal(result, expected) + else: + tm.assert_series_equal(result, expected) + + # check shallow_copied + assert obj is not result diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py new file mode 100644 index 0000000000000..6bab60f05ce89 --- /dev/null +++ b/pandas/tests/base/test_misc.py @@ -0,0 +1,204 @@ +import sys + +import numpy as np +import pytest + +from pandas.compat import PYPY + +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_object_dtype, +) + +import pandas as pd +from pandas import DataFrame, Index, IntervalIndex, Series +import pandas._testing as tm + + +@pytest.mark.parametrize( + "op_name, op", + [ + ("add", "+"), + ("sub", "-"), + ("mul", "*"), + ("mod", "%"), + ("pow", "**"), + ("truediv", "/"), + ("floordiv", "//"), + ], +) +@pytest.mark.parametrize("klass", [Series, DataFrame]) +def test_binary_ops_docstring(klass, op_name, op): + # not using the all_arithmetic_functions fixture with _get_opstr + # as _get_opstr is used internally in the dynamic implementation of the docstring + operand1 = klass.__name__.lower() + operand2 = "other" + expected_str = " ".join([operand1, op, operand2]) + assert expected_str in getattr(klass, op_name).__doc__ + + # reverse version of the binary ops + expected_str = " ".join([operand2, op, operand1]) + assert expected_str in getattr(klass, "r" + op_name).__doc__ + + +def test_none_comparison(series_with_simple_index): + series = series_with_simple_index + if isinstance(series.index, IntervalIndex): + # IntervalIndex breaks on "series[0] = np.nan" below + pytest.skip("IntervalIndex doesn't support assignment") + if len(series) < 1: + pytest.skip("Test doesn't make sense on empty data") + + # bug brought up by 
#1079 + # changed from TypeError in 0.17.0 + series[0] = np.nan + + # noinspection PyComparisonWithNone + result = series == None # noqa + assert not result.iat[0] + assert not result.iat[1] + + # noinspection PyComparisonWithNone + result = series != None # noqa + assert result.iat[0] + assert result.iat[1] + + result = None == series # noqa + assert not result.iat[0] + assert not result.iat[1] + + result = None != series # noqa + assert result.iat[0] + assert result.iat[1] + + if is_datetime64_dtype(series) or is_datetime64tz_dtype(series): + # Following DatetimeIndex (and Timestamp) convention, + # inequality comparisons with Series[datetime64] raise + msg = "Invalid comparison" + with pytest.raises(TypeError, match=msg): + None > series + with pytest.raises(TypeError, match=msg): + series > None + else: + result = None > series + assert not result.iat[0] + assert not result.iat[1] + + result = series < None + assert not result.iat[0] + assert not result.iat[1] + + +def test_ndarray_compat_properties(index_or_series_obj): + obj = index_or_series_obj + + # Check that we work. + for p in ["shape", "dtype", "T", "nbytes"]: + assert getattr(obj, p, None) is not None + + # deprecated properties + for p in ["flags", "strides", "itemsize", "base", "data"]: + assert not hasattr(obj, p) + + msg = "can only convert an array of size 1 to a Python scalar" + with pytest.raises(ValueError, match=msg): + obj.item() # len > 1 + + assert obj.ndim == 1 + assert obj.size == len(obj) + + assert Index([1]).item() == 1 + assert Series([1]).item() == 1 + + +@pytest.mark.skipif(PYPY, reason="not relevant for PyPy") +def test_memory_usage(index_or_series_obj): + obj = index_or_series_obj + res = obj.memory_usage() + res_deep = obj.memory_usage(deep=True) + + is_object = is_object_dtype(obj) or ( + isinstance(obj, Series) and is_object_dtype(obj.index) + ) + is_categorical = is_categorical_dtype(obj) or ( + isinstance(obj, Series) and is_categorical_dtype(obj.index) + ) + + if len(obj) == 0: + assert res_deep == res == 0 + elif is_object or is_categorical: + # only deep will pick them up + assert res_deep > res + else: + assert res == res_deep + + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = res_deep - sys.getsizeof(obj) + assert abs(diff) < 100 + + +def test_memory_usage_components_series(series_with_simple_index): + series = series_with_simple_index + total_usage = series.memory_usage(index=True) + non_index_usage = series.memory_usage(index=False) + index_usage = series.index.memory_usage() + assert total_usage == non_index_usage + index_usage + + +def test_memory_usage_components_narrow_series(narrow_series): + series = narrow_series + total_usage = series.memory_usage(index=True) + non_index_usage = series.memory_usage(index=False) + index_usage = series.index.memory_usage() + assert total_usage == non_index_usage + index_usage + + +def test_searchsorted(index_or_series_obj): + # numpy.searchsorted calls obj.searchsorted under the hood. 
+ # See gh-12238 + obj = index_or_series_obj + + if isinstance(obj, pd.MultiIndex): + # See gh-14833 + pytest.skip("np.searchsorted doesn't work on pd.MultiIndex") + + max_obj = max(obj, default=0) + index = np.searchsorted(obj, max_obj) + assert 0 <= index <= len(obj) + + index = np.searchsorted(obj, max_obj, sorter=range(len(obj))) + assert 0 <= index <= len(obj) + + +def test_access_by_position(indices): + index = indices + + if len(index) == 0: + pytest.skip("Test doesn't make sense on empty data") + elif isinstance(index, pd.MultiIndex): + pytest.skip("Can't instantiate Series from MultiIndex") + + series = pd.Series(index) + assert index[0] == series.iloc[0] + assert index[5] == series.iloc[5] + assert index[-1] == series.iloc[-1] + + size = len(index) + assert index[-1] == index[size - 1] + + msg = f"index {size} is out of bounds for axis 0 with size {size}" + with pytest.raises(IndexError, match=msg): + index[size] + msg = "single positional indexer is out-of-bounds" + with pytest.raises(IndexError, match=msg): + series.iloc[size] + + +def test_get_indexer_non_unique_dtype_mismatch(): + # GH 25459 + indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) + tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) + tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py deleted file mode 100644 index f85d823cb2fac..0000000000000 --- a/pandas/tests/base/test_ops.py +++ /dev/null @@ -1,878 +0,0 @@ -from datetime import datetime, timedelta -from io import StringIO -import sys -from typing import Any - -import numpy as np -import pytest - -from pandas._libs.tslib import iNaT -from pandas.compat import PYPY -from pandas.compat.numpy import np_array_datetime64_compat - -from pandas.core.dtypes.common import ( - is_datetime64_dtype, - is_datetime64tz_dtype, - is_object_dtype, - needs_i8_conversion, -) - -import pandas as pd -from pandas import ( - DataFrame, - DatetimeIndex, - Index, - Interval, - IntervalIndex, - PeriodIndex, - Series, - Timedelta, - TimedeltaIndex, - Timestamp, -) -import pandas._testing as tm - - -def allow_na_ops(obj: Any) -> bool: - """Whether to skip test cases including NaN""" - is_bool_index = isinstance(obj, Index) and obj.is_boolean() - return not is_bool_index and obj._can_hold_na - - -class Ops: - def setup_method(self, method): - self.bool_index = tm.makeBoolIndex(10, name="a") - self.int_index = tm.makeIntIndex(10, name="a") - self.float_index = tm.makeFloatIndex(10, name="a") - self.dt_index = tm.makeDateIndex(10, name="a") - self.dt_tz_index = tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern") - self.period_index = tm.makePeriodIndex(10, name="a") - self.string_index = tm.makeStringIndex(10, name="a") - self.unicode_index = tm.makeUnicodeIndex(10, name="a") - - arr = np.random.randn(10) - self.bool_series = Series(arr, index=self.bool_index, name="a") - self.int_series = Series(arr, index=self.int_index, name="a") - self.float_series = Series(arr, index=self.float_index, name="a") - self.dt_series = Series(arr, index=self.dt_index, name="a") - self.dt_tz_series = self.dt_tz_index.to_series() - self.period_series = Series(arr, index=self.period_index, name="a") - self.string_series = Series(arr, index=self.string_index, name="a") - self.unicode_series = Series(arr, index=self.unicode_index, name="a") - - types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"] - self.indexes = [getattr(self, f"{t}_index") for 
t in types] - self.series = [getattr(self, f"{t}_series") for t in types] - - # To test narrow dtypes, we use narrower *data* elements, not *index* elements - index = self.int_index - self.float32_series = Series(arr.astype(np.float32), index=index, name="a") - - arr_int = np.random.choice(10, size=10, replace=False) - self.int8_series = Series(arr_int.astype(np.int8), index=index, name="a") - self.int16_series = Series(arr_int.astype(np.int16), index=index, name="a") - self.int32_series = Series(arr_int.astype(np.int32), index=index, name="a") - - self.uint8_series = Series(arr_int.astype(np.uint8), index=index, name="a") - self.uint16_series = Series(arr_int.astype(np.uint16), index=index, name="a") - self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a") - - nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"] - self.narrow_series = [getattr(self, f"{t}_series") for t in nrw_types] - - self.objs = self.indexes + self.series + self.narrow_series - - -@pytest.mark.parametrize( - "op_name, op", - [ - ("add", "+"), - ("sub", "-"), - ("mul", "*"), - ("mod", "%"), - ("pow", "**"), - ("truediv", "/"), - ("floordiv", "//"), - ], -) -@pytest.mark.parametrize("klass", [Series, DataFrame]) -def test_binary_ops(klass, op_name, op): - # not using the all_arithmetic_functions fixture with _get_opstr - # as _get_opstr is used internally in the dynamic implementation of the docstring - operand1 = klass.__name__.lower() - operand2 = "other" - expected_str = " ".join([operand1, op, operand2]) - assert expected_str in getattr(klass, op_name).__doc__ - - # reverse version of the binary ops - expected_str = " ".join([operand2, op, operand1]) - assert expected_str in getattr(klass, "r" + op_name).__doc__ - - -class TestTranspose: - errmsg = "the 'axes' parameter is not supported" - - def test_transpose(self, index_or_series_obj): - obj = index_or_series_obj - tm.assert_equal(obj.transpose(), obj) - - def test_transpose_non_default_axes(self, index_or_series_obj): - obj = index_or_series_obj - with pytest.raises(ValueError, match=self.errmsg): - obj.transpose(1) - with pytest.raises(ValueError, match=self.errmsg): - obj.transpose(axes=1) - - def test_numpy_transpose(self, index_or_series_obj): - obj = index_or_series_obj - tm.assert_equal(np.transpose(obj), obj) - - with pytest.raises(ValueError, match=self.errmsg): - np.transpose(obj, axes=1) - - -class TestIndexOps(Ops): - def setup_method(self, method): - super().setup_method(method) - self.is_valid_objs = self.objs - self.not_valid_objs = [] - - def test_none_comparison(self, series_with_simple_index): - series = series_with_simple_index - if isinstance(series.index, IntervalIndex): - # IntervalIndex breaks on "series[0] = np.nan" below - pytest.skip("IntervalIndex doesn't support assignment") - if len(series) < 1: - pytest.skip("Test doesn't make sense on empty data") - - # bug brought up by #1079 - # changed from TypeError in 0.17.0 - series[0] = np.nan - - # noinspection PyComparisonWithNone - result = series == None # noqa - assert not result.iat[0] - assert not result.iat[1] - - # noinspection PyComparisonWithNone - result = series != None # noqa - assert result.iat[0] - assert result.iat[1] - - result = None == series # noqa - assert not result.iat[0] - assert not result.iat[1] - - result = None != series # noqa - assert result.iat[0] - assert result.iat[1] - - if is_datetime64_dtype(series) or is_datetime64tz_dtype(series): - # Following DatetimeIndex (and Timestamp) convention, - # inequality 
comparisons with Series[datetime64] raise - msg = "Invalid comparison" - with pytest.raises(TypeError, match=msg): - None > series - with pytest.raises(TypeError, match=msg): - series > None - else: - result = None > series - assert not result.iat[0] - assert not result.iat[1] - - result = series < None - assert not result.iat[0] - assert not result.iat[1] - - def test_ndarray_compat_properties(self, index_or_series_obj): - obj = index_or_series_obj - - # Check that we work. - for p in ["shape", "dtype", "T", "nbytes"]: - assert getattr(obj, p, None) is not None - - # deprecated properties - for p in ["flags", "strides", "itemsize", "base", "data"]: - assert not hasattr(obj, p) - - msg = "can only convert an array of size 1 to a Python scalar" - with pytest.raises(ValueError, match=msg): - obj.item() # len > 1 - - assert obj.ndim == 1 - assert obj.size == len(obj) - - assert Index([1]).item() == 1 - assert Series([1]).item() == 1 - - def test_value_counts_unique_nunique(self, index_or_series_obj): - orig = index_or_series_obj - obj = orig.copy() - klass = type(obj) - values = obj._values - - if orig.duplicated().any(): - pytest.xfail( - "The test implementation isn't flexible enough to deal" - " with duplicated values. This isn't a bug in the" - " application code, but in the test code." - ) - - # create repeated values, 'n'th element is repeated by n+1 times - if isinstance(obj, Index): - expected_index = Index(obj[::-1]) - expected_index.name = None - obj = obj.repeat(range(1, len(obj) + 1)) - else: - expected_index = Index(values[::-1]) - idx = obj.index.repeat(range(1, len(obj) + 1)) - # take-based repeat - indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1)) - rep = values.take(indices) - obj = klass(rep, index=idx) - - # check values has the same dtype as the original - assert obj.dtype == orig.dtype - - expected_s = Series( - range(len(orig), 0, -1), index=expected_index, dtype="int64" - ) - - result = obj.value_counts() - tm.assert_series_equal(result, expected_s) - assert result.index.name is None - - result = obj.unique() - if isinstance(obj, Index): - assert isinstance(result, type(obj)) - tm.assert_index_equal(result, orig) - assert result.dtype == orig.dtype - elif is_datetime64tz_dtype(obj): - # datetimetz Series returns array of Timestamp - assert result[0] == orig[0] - for r in result: - assert isinstance(r, Timestamp) - - tm.assert_numpy_array_equal( - result.astype(object), orig._values.astype(object) - ) - else: - tm.assert_numpy_array_equal(result, orig.values) - assert result.dtype == orig.dtype - - # dropna=True would break for MultiIndex - assert obj.nunique(dropna=False) == len(np.unique(obj.values)) - - @pytest.mark.parametrize("null_obj", [np.nan, None]) - def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj): - orig = index_or_series_obj - obj = orig.copy() - klass = type(obj) - values = obj._ndarray_values - num_values = len(orig) - - if not allow_na_ops(obj): - pytest.skip("type doesn't allow for NA operations") - elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)): - pytest.skip(f"values of {klass} cannot be changed") - elif isinstance(orig, pd.MultiIndex): - pytest.skip("MultiIndex doesn't support isna") - elif orig.duplicated().any(): - pytest.xfail( - "The test implementation isn't flexible enough to deal" - " with duplicated values. This isn't a bug in the" - " application code, but in the test code." 
- ) - - # special assign to the numpy array - if is_datetime64tz_dtype(obj): - if isinstance(obj, DatetimeIndex): - v = obj.asi8 - v[0:2] = iNaT - values = obj._shallow_copy(v) - else: - obj = obj.copy() - obj[0:2] = pd.NaT - values = obj._values - - elif needs_i8_conversion(obj): - values[0:2] = iNaT - values = obj._shallow_copy(values) - else: - values[0:2] = null_obj - - # check values has the same dtype as the original - assert values.dtype == obj.dtype - - # create repeated values, 'n'th element is repeated by n+1 - # times - if isinstance(obj, (DatetimeIndex, PeriodIndex)): - expected_index = obj.copy() - expected_index.name = None - - # attach name to klass - obj = klass(values.repeat(range(1, len(obj) + 1))) - obj.name = "a" - else: - if isinstance(obj, DatetimeIndex): - expected_index = orig._values._shallow_copy(values) - else: - expected_index = Index(values) - expected_index.name = None - obj = obj.repeat(range(1, len(obj) + 1)) - obj.name = "a" - - # check values has the same dtype as the original - assert obj.dtype == orig.dtype - - # check values correctly have NaN - nanloc = np.zeros(len(obj), dtype=np.bool) - nanloc[:3] = True - if isinstance(obj, Index): - tm.assert_numpy_array_equal(pd.isna(obj), nanloc) - else: - exp = Series(nanloc, obj.index, name="a") - tm.assert_series_equal(pd.isna(obj), exp) - - expected_data = list(range(num_values, 2, -1)) - expected_data_na = expected_data.copy() - if expected_data_na: - expected_data_na.append(3) - expected_s_na = Series( - expected_data_na, - index=expected_index[num_values - 1 : 0 : -1], - dtype="int64", - name="a", - ) - expected_s = Series( - expected_data, - index=expected_index[num_values - 1 : 1 : -1], - dtype="int64", - name="a", - ) - - result_s_na = obj.value_counts(dropna=False) - tm.assert_series_equal(result_s_na, expected_s_na) - assert result_s_na.index.name is None - assert result_s_na.name == "a" - result_s = obj.value_counts() - tm.assert_series_equal(obj.value_counts(), expected_s) - assert result_s.index.name is None - assert result_s.name == "a" - - result = obj.unique() - if isinstance(obj, Index): - tm.assert_index_equal(result, Index(values[1:], name="a")) - elif is_datetime64tz_dtype(obj): - # unable to compare NaT / nan - tm.assert_extension_array_equal(result[1:], values[2:]) - assert result[0] is pd.NaT - elif len(obj) > 0: - tm.assert_numpy_array_equal(result[1:], values[2:]) - - assert pd.isna(result[0]) - assert result.dtype == orig.dtype - - assert obj.nunique() == max(0, num_values - 2) - assert obj.nunique(dropna=False) == max(0, num_values - 1) - - def test_value_counts_inferred(self, index_or_series): - klass = index_or_series - s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] - s = klass(s_values) - expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) - tm.assert_series_equal(s.value_counts(), expected) - - if isinstance(s, Index): - exp = Index(np.unique(np.array(s_values, dtype=np.object_))) - tm.assert_index_equal(s.unique(), exp) - else: - exp = np.unique(np.array(s_values, dtype=np.object_)) - tm.assert_numpy_array_equal(s.unique(), exp) - - assert s.nunique() == 4 - # don't sort, have to sort after the fact as not sorting is - # platform-dep - hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() - tm.assert_series_equal(hist, expected) - - # sort ascending - hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list("cdab")) - tm.assert_series_equal(hist, expected) - - # relative 
histogram. - hist = s.value_counts(normalize=True) - expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) - tm.assert_series_equal(hist, expected) - - def test_value_counts_bins(self, index_or_series): - klass = index_or_series - s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] - s = klass(s_values) - - # bins - msg = "bins argument only works with numeric data" - with pytest.raises(TypeError, match=msg): - s.value_counts(bins=1) - - s1 = Series([1, 1, 2, 3]) - res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.997, 3.0): 4}) - tm.assert_series_equal(res1, exp1) - res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.997, 3.0): 1.0}) - tm.assert_series_equal(res1n, exp1n) - - if isinstance(s1, Index): - tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) - else: - exp = np.array([1, 2, 3], dtype=np.int64) - tm.assert_numpy_array_equal(s1.unique(), exp) - - assert s1.nunique() == 3 - - # these return the same - res4 = s1.value_counts(bins=4, dropna=True) - intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4, exp4) - - res4 = s1.value_counts(bins=4, dropna=False) - intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4, exp4) - - res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) - tm.assert_series_equal(res4n, exp4n) - - # handle NA's properly - s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] - s = klass(s_values) - expected = Series([4, 3, 2], index=["b", "a", "d"]) - tm.assert_series_equal(s.value_counts(), expected) - - if isinstance(s, Index): - exp = Index(["a", "b", np.nan, "d"]) - tm.assert_index_equal(s.unique(), exp) - else: - exp = np.array(["a", "b", np.nan, "d"], dtype=object) - tm.assert_numpy_array_equal(s.unique(), exp) - assert s.nunique() == 3 - - s = klass({}) if klass is dict else klass({}, dtype=object) - expected = Series([], dtype=np.int64) - tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) - # returned dtype differs depending on original - if isinstance(s, Index): - tm.assert_index_equal(s.unique(), Index([]), exact=False) - else: - tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) - - assert s.nunique() == 0 - - def test_value_counts_datetime64(self, index_or_series): - klass = index_or_series - - # GH 3002, datetime64[ns] - # don't test names though - txt = "\n".join( - [ - "xxyyzz20100101PIE", - "xxyyzz20100101GUM", - "xxyyzz20100101EGG", - "xxyyww20090101EGG", - "foofoo20080909PIE", - "foofoo20080909GUM", - ] - ) - f = StringIO(txt) - df = pd.read_fwf( - f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"] - ) - - s = klass(df["dt"].copy()) - s.name = None - idx = pd.to_datetime( - ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] - ) - expected_s = Series([3, 2, 1], index=idx) - tm.assert_series_equal(s.value_counts(), expected_s) - - expected = np_array_datetime64_compat( - ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], - dtype="datetime64[ns]", - ) - if isinstance(s, Index): - tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) - else: - tm.assert_numpy_array_equal(s.unique(), expected) - - assert s.nunique() == 3 - - # with NaT - s = df["dt"].copy() - s = 
klass(list(s.values) + [pd.NaT]) - - result = s.value_counts() - assert result.index.dtype == "datetime64[ns]" - tm.assert_series_equal(result, expected_s) - - result = s.value_counts(dropna=False) - expected_s[pd.NaT] = 1 - tm.assert_series_equal(result, expected_s) - - unique = s.unique() - assert unique.dtype == "datetime64[ns]" - - # numpy_array_equal cannot compare pd.NaT - if isinstance(s, Index): - exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) - tm.assert_index_equal(unique, exp_idx) - else: - tm.assert_numpy_array_equal(unique[:3], expected) - assert pd.isna(unique[3]) - - assert s.nunique() == 3 - assert s.nunique(dropna=False) == 4 - - # timedelta64[ns] - td = df.dt - df.dt + timedelta(1) - td = klass(td, name="dt") - - result = td.value_counts() - expected_s = Series([6], index=[Timedelta("1day")], name="dt") - tm.assert_series_equal(result, expected_s) - - expected = TimedeltaIndex(["1 days"], name="dt") - if isinstance(td, Index): - tm.assert_index_equal(td.unique(), expected) - else: - tm.assert_numpy_array_equal(td.unique(), expected.values) - - td2 = timedelta(1) + (df.dt - df.dt) - td2 = klass(td2, name="dt") - result2 = td2.value_counts() - tm.assert_series_equal(result2, expected_s) - - def test_factorize(self): - for orig in self.objs: - o = orig.copy() - - if isinstance(o, Index) and o.is_boolean(): - exp_arr = np.array([0, 1] + [0] * 8, dtype=np.intp) - exp_uniques = o - exp_uniques = Index([False, True]) - else: - exp_arr = np.array(range(len(o)), dtype=np.intp) - exp_uniques = o - codes, uniques = o.factorize() - - tm.assert_numpy_array_equal(codes, exp_arr) - if isinstance(o, Series): - tm.assert_index_equal(uniques, Index(orig), check_names=False) - else: - # factorize explicitly resets name - tm.assert_index_equal(uniques, exp_uniques, check_names=False) - - def test_factorize_repeated(self): - for orig in self.objs: - o = orig.copy() - - # don't test boolean - if isinstance(o, Index) and o.is_boolean(): - continue - - # sort by value, and create duplicates - if isinstance(o, Series): - o = o.sort_values() - n = o.iloc[5:].append(o) - else: - indexer = o.argsort() - o = o.take(indexer) - n = o[5:].append(o) - - exp_arr = np.array( - [5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp - ) - codes, uniques = n.factorize(sort=True) - - tm.assert_numpy_array_equal(codes, exp_arr) - if isinstance(o, Series): - tm.assert_index_equal( - uniques, Index(orig).sort_values(), check_names=False - ) - else: - tm.assert_index_equal(uniques, o, check_names=False) - - exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], np.intp) - codes, uniques = n.factorize(sort=False) - tm.assert_numpy_array_equal(codes, exp_arr) - - if isinstance(o, Series): - expected = Index(o.iloc[5:10].append(o.iloc[:5])) - tm.assert_index_equal(uniques, expected, check_names=False) - else: - expected = o[5:10].append(o[:5]) - tm.assert_index_equal(uniques, expected, check_names=False) - - def test_duplicated_drop_duplicates_index(self): - # GH 4060 - for original in self.objs: - if isinstance(original, Index): - - # special case - if original.is_boolean(): - result = original.drop_duplicates() - expected = Index([False, True], name="a") - tm.assert_index_equal(result, expected) - continue - - # original doesn't have duplicates - expected = np.array([False] * len(original), dtype=bool) - duplicated = original.duplicated() - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = original.drop_duplicates() - tm.assert_index_equal(result, 
original) - assert result is not original - - # has_duplicates - assert not original.has_duplicates - - # create repeated values, 3rd and 5th values are duplicated - idx = original[list(range(len(original))) + [5, 3]] - expected = np.array([False] * len(original) + [True, True], dtype=bool) - duplicated = idx.duplicated() - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - tm.assert_index_equal(idx.drop_duplicates(), original) - - base = [False] * len(idx) - base[3] = True - base[5] = True - expected = np.array(base) - - duplicated = idx.duplicated(keep="last") - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = idx.drop_duplicates(keep="last") - tm.assert_index_equal(result, idx[~expected]) - - base = [False] * len(original) + [True, True] - base[3] = True - base[5] = True - expected = np.array(base) - - duplicated = idx.duplicated(keep=False) - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = idx.drop_duplicates(keep=False) - tm.assert_index_equal(result, idx[~expected]) - - with pytest.raises( - TypeError, - match=r"drop_duplicates\(\) got an unexpected keyword argument", - ): - idx.drop_duplicates(inplace=True) - - else: - expected = Series( - [False] * len(original), index=original.index, name="a" - ) - tm.assert_series_equal(original.duplicated(), expected) - result = original.drop_duplicates() - tm.assert_series_equal(result, original) - assert result is not original - - idx = original.index[list(range(len(original))) + [5, 3]] - values = original._values[list(range(len(original))) + [5, 3]] - s = Series(values, index=idx, name="a") - - expected = Series( - [False] * len(original) + [True, True], index=idx, name="a" - ) - tm.assert_series_equal(s.duplicated(), expected) - tm.assert_series_equal(s.drop_duplicates(), original) - - base = [False] * len(idx) - base[3] = True - base[5] = True - expected = Series(base, index=idx, name="a") - - tm.assert_series_equal(s.duplicated(keep="last"), expected) - tm.assert_series_equal( - s.drop_duplicates(keep="last"), s[~np.array(base)] - ) - - base = [False] * len(original) + [True, True] - base[3] = True - base[5] = True - expected = Series(base, index=idx, name="a") - - tm.assert_series_equal(s.duplicated(keep=False), expected) - tm.assert_series_equal( - s.drop_duplicates(keep=False), s[~np.array(base)] - ) - - s.drop_duplicates(inplace=True) - tm.assert_series_equal(s, original) - - def test_drop_duplicates_series_vs_dataframe(self): - # GH 14192 - df = pd.DataFrame( - { - "a": [1, 1, 1, "one", "one"], - "b": [2, 2, np.nan, np.nan, np.nan], - "c": [3, 3, np.nan, np.nan, "three"], - "d": [1, 2, 3, 4, 4], - "e": [ - datetime(2015, 1, 1), - datetime(2015, 1, 1), - datetime(2015, 2, 1), - pd.NaT, - pd.NaT, - ], - } - ) - for column in df.columns: - for keep in ["first", "last", False]: - dropped_frame = df[[column]].drop_duplicates(keep=keep) - dropped_series = df[column].drop_duplicates(keep=keep) - tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) - - def test_fillna(self): - # # GH 11343 - # though Index.fillna and Series.fillna has separate impl, - # test here to confirm these works as the same - - for orig in self.objs: - - o = orig.copy() - values = o.values - - # values will not be changed - result = o.fillna(o.astype(object).values[0]) - if isinstance(o, Index): - tm.assert_index_equal(o, result) - else: - tm.assert_series_equal(o, result) - # check shallow_copied - assert o is not result - - for 
null_obj in [np.nan, None]: - for orig in self.objs: - o = orig.copy() - klass = type(o) - - if not allow_na_ops(o): - continue - - if needs_i8_conversion(o): - - values = o.astype(object).values - fill_value = values[0] - values[0:2] = pd.NaT - else: - values = o.values.copy() - fill_value = o.values[0] - values[0:2] = null_obj - - expected = [fill_value] * 2 + list(values[2:]) - - expected = klass(expected, dtype=orig.dtype) - o = klass(values) - - # check values has the same dtype as the original - assert o.dtype == orig.dtype - - result = o.fillna(fill_value) - if isinstance(o, Index): - tm.assert_index_equal(result, expected) - else: - tm.assert_series_equal(result, expected) - # check shallow_copied - assert o is not result - - @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") - def test_memory_usage(self): - for o in self.objs: - res = o.memory_usage() - res_deep = o.memory_usage(deep=True) - - if is_object_dtype(o) or ( - isinstance(o, Series) and is_object_dtype(o.index) - ): - # if there are objects, only deep will pick them up - assert res_deep > res - else: - assert res == res_deep - - if isinstance(o, Series): - assert ( - o.memory_usage(index=False) + o.index.memory_usage() - ) == o.memory_usage(index=True) - - # sys.getsizeof will call the .memory_usage with - # deep=True, and add on some GC overhead - diff = res_deep - sys.getsizeof(o) - assert abs(diff) < 100 - - def test_searchsorted(self): - # See gh-12238 - for o in self.objs: - index = np.searchsorted(o, max(o)) - assert 0 <= index <= len(o) - - index = np.searchsorted(o, max(o), sorter=range(len(o))) - assert 0 <= index <= len(o) - - def test_validate_bool_args(self): - invalid_values = [1, "True", [1, 2, 3], 5.0] - - for value in invalid_values: - msg = "expected type bool" - with pytest.raises(ValueError, match=msg): - self.int_series.drop_duplicates(inplace=value) - - def test_getitem(self): - for i in self.indexes: - s = pd.Series(i) - - assert i[0] == s.iloc[0] - assert i[5] == s.iloc[5] - assert i[-1] == s.iloc[-1] - - assert i[-1] == i[9] - - msg = "index 20 is out of bounds for axis 0 with size 10" - with pytest.raises(IndexError, match=msg): - i[20] - msg = "single positional indexer is out-of-bounds" - with pytest.raises(IndexError, match=msg): - s.iloc[20] - - @pytest.mark.parametrize("indexer_klass", [list, pd.Index]) - @pytest.mark.parametrize( - "indexer", - [ - [True] * 10, - [False] * 10, - [True, False, True, True, False, False, True, True, False, True], - ], - ) - def test_bool_indexing(self, indexer_klass, indexer): - # GH 22533 - for idx in self.indexes: - exp_idx = [i for i in range(len(indexer)) if indexer[i]] - tm.assert_index_equal(idx[indexer_klass(indexer)], idx[exp_idx]) - s = pd.Series(idx) - tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) - - def test_get_indexer_non_unique_dtype_mismatch(self): - # GH 25459 - indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) - tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) - tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) diff --git a/pandas/tests/base/test_transpose.py b/pandas/tests/base/test_transpose.py new file mode 100644 index 0000000000000..5ba278368834c --- /dev/null +++ b/pandas/tests/base/test_transpose.py @@ -0,0 +1,27 @@ +import numpy as np +import pytest + +import pandas._testing as tm + + +def test_transpose(index_or_series_obj): + obj = index_or_series_obj + tm.assert_equal(obj.transpose(), obj) + + +def 
test_transpose_non_default_axes(index_or_series_obj): + msg = "the 'axes' parameter is not supported" + obj = index_or_series_obj + with pytest.raises(ValueError, match=msg): + obj.transpose(1) + with pytest.raises(ValueError, match=msg): + obj.transpose(axes=1) + + +def test_numpy_transpose(index_or_series_obj): + msg = "the 'axes' parameter is not supported" + obj = index_or_series_obj + tm.assert_equal(np.transpose(obj), obj) + + with pytest.raises(ValueError, match=msg): + np.transpose(obj, axes=1) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py new file mode 100644 index 0000000000000..c6225c9b5ca64 --- /dev/null +++ b/pandas/tests/base/test_unique.py @@ -0,0 +1,107 @@ +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT + +from pandas.core.dtypes.common import is_datetime64tz_dtype, needs_i8_conversion + +import pandas as pd +import pandas._testing as tm +from pandas.tests.base.common import allow_na_ops + + +def test_unique(index_or_series_obj): + obj = index_or_series_obj + obj = np.repeat(obj, range(1, len(obj) + 1)) + result = obj.unique() + + # dict.fromkeys preserves the order + unique_values = list(dict.fromkeys(obj.values)) + if isinstance(obj, pd.MultiIndex): + expected = pd.MultiIndex.from_tuples(unique_values) + expected.names = obj.names + tm.assert_index_equal(result, expected) + elif isinstance(obj, pd.Index): + expected = pd.Index(unique_values, dtype=obj.dtype) + if is_datetime64tz_dtype(obj): + expected = expected.normalize() + tm.assert_index_equal(result, expected) + else: + expected = np.array(unique_values) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("null_obj", [np.nan, None]) +def test_unique_null(null_obj, index_or_series_obj): + obj = index_or_series_obj + + if not allow_na_ops(obj): + pytest.skip("type doesn't allow for NA operations") + elif len(obj) < 1: + pytest.skip("Test doesn't make sense on empty data") + elif isinstance(obj, pd.MultiIndex): + pytest.skip(f"MultiIndex can't hold '{null_obj}'") + + values = obj.values + if needs_i8_conversion(obj): + values[0:2] = iNaT + else: + values[0:2] = null_obj + + klass = type(obj) + repeated_values = np.repeat(values, range(1, len(values) + 1)) + obj = klass(repeated_values, dtype=obj.dtype) + result = obj.unique() + + unique_values_raw = dict.fromkeys(obj.values) + # because np.nan == np.nan is False, but None == None is True + # np.nan would be duplicated, whereas None wouldn't + unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)] + unique_values = [null_obj] + unique_values_not_null + + if isinstance(obj, pd.Index): + expected = pd.Index(unique_values, dtype=obj.dtype) + if is_datetime64tz_dtype(obj): + result = result.normalize() + expected = expected.normalize() + elif isinstance(obj, pd.CategoricalIndex): + expected = expected.set_categories(unique_values_not_null) + tm.assert_index_equal(result, expected) + else: + expected = np.array(unique_values, dtype=obj.dtype) + tm.assert_numpy_array_equal(result, expected) + + +def test_nunique(index_or_series_obj): + obj = index_or_series_obj + obj = np.repeat(obj, range(1, len(obj) + 1)) + expected = len(obj.unique()) + assert obj.nunique(dropna=False) == expected + + +@pytest.mark.parametrize("null_obj", [np.nan, None]) +def test_nunique_null(null_obj, index_or_series_obj): + obj = index_or_series_obj + + if not allow_na_ops(obj): + pytest.skip("type doesn't allow for NA operations") + elif isinstance(obj, pd.MultiIndex): + 
pytest.skip(f"MultiIndex can't hold '{null_obj}'") + + values = obj.values + if needs_i8_conversion(obj): + values[0:2] = iNaT + else: + values[0:2] = null_obj + + klass = type(obj) + repeated_values = np.repeat(values, range(1, len(values) + 1)) + obj = klass(repeated_values, dtype=obj.dtype) + + if isinstance(obj, pd.CategoricalIndex): + assert obj.nunique() == len(obj.categories) + assert obj.nunique(dropna=False) == len(obj.categories) + 1 + else: + num_unique_values = len(obj.unique()) + assert obj.nunique() == max(0, num_unique_values - 1) + assert obj.nunique(dropna=False) == max(0, num_unique_values) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py new file mode 100644 index 0000000000000..d45feaff68dde --- /dev/null +++ b/pandas/tests/base/test_value_counts.py @@ -0,0 +1,276 @@ +import collections +from datetime import timedelta +from io import StringIO + +import numpy as np +import pytest + +from pandas._libs.tslib import iNaT +from pandas.compat.numpy import np_array_datetime64_compat + +from pandas.core.dtypes.common import needs_i8_conversion + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + Interval, + IntervalIndex, + Series, + Timedelta, + TimedeltaIndex, +) +import pandas._testing as tm +from pandas.tests.base.common import allow_na_ops + + +def test_value_counts(index_or_series_obj): + obj = index_or_series_obj + obj = np.repeat(obj, range(1, len(obj) + 1)) + result = obj.value_counts() + + counter = collections.Counter(obj) + expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) + expected.index = expected.index.astype(obj.dtype) + if isinstance(obj, pd.MultiIndex): + expected.index = pd.Index(expected.index) + + # TODO: Order of entries with the same count is inconsistent on CI (gh-32449) + if obj.duplicated().any(): + result = result.sort_index() + expected = expected.sort_index() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("null_obj", [np.nan, None]) +def test_value_counts_null(null_obj, index_or_series_obj): + orig = index_or_series_obj + obj = orig.copy() + + if not allow_na_ops(obj): + pytest.skip("type doesn't allow for NA operations") + elif len(obj) < 1: + pytest.skip("Test doesn't make sense on empty data") + elif isinstance(orig, pd.MultiIndex): + pytest.skip(f"MultiIndex can't hold '{null_obj}'") + + values = obj.values + if needs_i8_conversion(obj): + values[0:2] = iNaT + else: + values[0:2] = null_obj + + klass = type(obj) + repeated_values = np.repeat(values, range(1, len(values) + 1)) + obj = klass(repeated_values, dtype=obj.dtype) + + # because np.nan == np.nan is False, but None == None is True + # np.nan would be duplicated, whereas None wouldn't + counter = collections.Counter(obj.dropna()) + expected = pd.Series(dict(counter.most_common()), dtype=np.int64) + expected.index = expected.index.astype(obj.dtype) + + result = obj.value_counts() + if obj.duplicated().any(): + # TODO: + # Order of entries with the same count is inconsistent on CI (gh-32449) + expected = expected.sort_index() + result = result.sort_index() + tm.assert_series_equal(result, expected) + + # can't use expected[null_obj] = 3 as + # IntervalIndex doesn't allow assignment + new_entry = pd.Series({np.nan: 3}, dtype=np.int64) + expected = expected.append(new_entry) + + result = obj.value_counts(dropna=False) + if obj.duplicated().any(): + # TODO: + # Order of entries with the same count is inconsistent on CI (gh-32449) + expected = expected.sort_index() + result = 
result.sort_index() + tm.assert_series_equal(result, expected) + + +def test_value_counts_inferred(index_or_series): + klass = index_or_series + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] + s = klass(s_values) + expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(np.unique(np.array(s_values, dtype=np.object_))) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.unique(np.array(s_values, dtype=np.object_)) + tm.assert_numpy_array_equal(s.unique(), exp) + + assert s.nunique() == 4 + # don't sort, have to sort after the fact as not sorting is + # platform-dep + hist = s.value_counts(sort=False).sort_values() + expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() + tm.assert_series_equal(hist, expected) + + # sort ascending + hist = s.value_counts(ascending=True) + expected = Series([1, 2, 3, 4], index=list("cdab")) + tm.assert_series_equal(hist, expected) + + # relative histogram. + hist = s.value_counts(normalize=True) + expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) + tm.assert_series_equal(hist, expected) + + +def test_value_counts_bins(index_or_series): + klass = index_or_series + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] + s = klass(s_values) + + # bins + msg = "bins argument only works with numeric data" + with pytest.raises(TypeError, match=msg): + s.value_counts(bins=1) + + s1 = Series([1, 1, 2, 3]) + res1 = s1.value_counts(bins=1) + exp1 = Series({Interval(0.997, 3.0): 4}) + tm.assert_series_equal(res1, exp1) + res1n = s1.value_counts(bins=1, normalize=True) + exp1n = Series({Interval(0.997, 3.0): 1.0}) + tm.assert_series_equal(res1n, exp1n) + + if isinstance(s1, Index): + tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) + else: + exp = np.array([1, 2, 3], dtype=np.int64) + tm.assert_numpy_array_equal(s1.unique(), exp) + + assert s1.nunique() == 3 + + # these return the same + res4 = s1.value_counts(bins=4, dropna=True) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4 = s1.value_counts(bins=4, dropna=False) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4, exp4) + + res4n = s1.value_counts(bins=4, normalize=True) + exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) + tm.assert_series_equal(res4n, exp4n) + + # handle NA's properly + s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] + s = klass(s_values) + expected = Series([4, 3, 2], index=["b", "a", "d"]) + tm.assert_series_equal(s.value_counts(), expected) + + if isinstance(s, Index): + exp = Index(["a", "b", np.nan, "d"]) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.array(["a", "b", np.nan, "d"], dtype=object) + tm.assert_numpy_array_equal(s.unique(), exp) + assert s.nunique() == 3 + + s = klass({}) if klass is dict else klass({}, dtype=object) + expected = Series([], dtype=np.int64) + tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) + # returned dtype differs depending on original + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), Index([]), exact=False) + else: + tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) + + assert s.nunique() == 0 + + +def 
test_value_counts_datetime64(index_or_series): + klass = index_or_series + + # GH 3002, datetime64[ns] + # don't test names though + txt = "\n".join( + [ + "xxyyzz20100101PIE", + "xxyyzz20100101GUM", + "xxyyzz20100101EGG", + "xxyyww20090101EGG", + "foofoo20080909PIE", + "foofoo20080909GUM", + ] + ) + f = StringIO(txt) + df = pd.read_fwf( + f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"] + ) + + s = klass(df["dt"].copy()) + s.name = None + idx = pd.to_datetime( + ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] + ) + expected_s = Series([3, 2, 1], index=idx) + tm.assert_series_equal(s.value_counts(), expected_s) + + expected = np_array_datetime64_compat( + ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], + dtype="datetime64[ns]", + ) + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) + else: + tm.assert_numpy_array_equal(s.unique(), expected) + + assert s.nunique() == 3 + + # with NaT + s = df["dt"].copy() + s = klass(list(s.values) + [pd.NaT]) + + result = s.value_counts() + assert result.index.dtype == "datetime64[ns]" + tm.assert_series_equal(result, expected_s) + + result = s.value_counts(dropna=False) + expected_s[pd.NaT] = 1 + tm.assert_series_equal(result, expected_s) + + unique = s.unique() + assert unique.dtype == "datetime64[ns]" + + # numpy_array_equal cannot compare pd.NaT + if isinstance(s, Index): + exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) + tm.assert_index_equal(unique, exp_idx) + else: + tm.assert_numpy_array_equal(unique[:3], expected) + assert pd.isna(unique[3]) + + assert s.nunique() == 3 + assert s.nunique(dropna=False) == 4 + + # timedelta64[ns] + td = df.dt - df.dt + timedelta(1) + td = klass(td, name="dt") + + result = td.value_counts() + expected_s = Series([6], index=[Timedelta("1day")], name="dt") + tm.assert_series_equal(result, expected_s) + + expected = TimedeltaIndex(["1 days"], name="dt") + if isinstance(td, Index): + tm.assert_index_equal(td.unique(), expected) + else: + tm.assert_numpy_array_equal(td.unique(), expected.values) + + td2 = timedelta(1) + (df.dt - df.dt) + td2 = klass(td2, name="dt") + result2 = td2.value_counts() + tm.assert_series_equal(result2, expected_s) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index a240e6cef5930..08d8d5ca342b7 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -375,7 +375,8 @@ def check_pow(self, lhs, arith1, rhs): and is_scalar(rhs) and _is_py3_complex_incompat(result, expected) ): - with pytest.raises(AssertionError): + msg = "(DataFrame.columns|numpy array) are different" + with pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(result, expected) else: tm.assert_almost_equal(result, expected) @@ -449,16 +450,19 @@ def test_frame_invert(self): # float always raises lhs = DataFrame(randn(5, 2)) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'invert_dd'" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: - with pytest.raises(TypeError): + msg = "ufunc 'invert' not supported for the input types" + with pytest.raises(TypeError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) # int raises on numexpr lhs = DataFrame(randint(5, size=(5, 2))) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't 
find matching opcode for 'invert" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: expect = ~lhs @@ -474,10 +478,11 @@ def test_frame_invert(self): # object raises lhs = DataFrame({"b": ["a", 1, 2.0], "c": rand(3) > 0.5}) if self.engine == "numexpr": - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="unknown type object"): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: - with pytest.raises(TypeError): + msg = "bad operand type for unary ~: 'str'" + with pytest.raises(TypeError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) def test_series_invert(self): @@ -488,16 +493,19 @@ def test_series_invert(self): # float raises lhs = Series(randn(5)) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'invert_dd'" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: - with pytest.raises(TypeError): + msg = "ufunc 'invert' not supported for the input types" + with pytest.raises(TypeError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) # int raises on numexpr lhs = Series(randint(5, size=5)) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'invert" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: expect = ~lhs @@ -517,10 +525,11 @@ def test_series_invert(self): # object lhs = Series(["a", 1, 2.0]) if self.engine == "numexpr": - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="unknown type object"): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: - with pytest.raises(TypeError): + msg = "bad operand type for unary ~: 'str'" + with pytest.raises(TypeError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) def test_frame_negate(self): @@ -541,7 +550,8 @@ def test_frame_negate(self): # bool doesn't work with numexpr but works elsewhere lhs = DataFrame(rand(5, 2) > 0.5) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'neg_bb'" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: expect = -lhs @@ -566,7 +576,8 @@ def test_series_negate(self): # bool doesn't work with numexpr but works elsewhere lhs = Series(rand(5) > 0.5) if self.engine == "numexpr": - with pytest.raises(NotImplementedError): + msg = "couldn't find matching opcode for 'neg_bb'" + with pytest.raises(NotImplementedError, match=msg): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: expect = -lhs @@ -610,7 +621,8 @@ def test_series_pos(self, lhs): tm.assert_series_equal(expect, result) def test_scalar_unary(self): - with pytest.raises(TypeError): + msg = "bad operand type for unary ~: 'float'" + with pytest.raises(TypeError, match=msg): pd.eval("~1.0", engine=self.engine, parser=self.parser) assert pd.eval("-1.0", parser=self.parser, engine=self.engine) == -1.0 @@ -671,7 +683,8 @@ def test_disallow_scalar_bool_ops(self): x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2)) # noqa for ex in exprs: - with pytest.raises(NotImplementedError): + msg = "cannot evaluate scalar only bool ops|'BoolOp' nodes are not" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, 
engine=self.engine, parser=self.parser) def test_identical(self): @@ -772,7 +785,8 @@ def setup_ops(self): def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): ex1 = f"lhs {cmp1} mid {cmp2} rhs" - with pytest.raises(NotImplementedError): + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex1, engine=self.engine, parser=self.parser) @@ -1183,7 +1197,8 @@ def test_bool_ops_with_constants(self): def test_4d_ndarray_fails(self): x = randn(3, 4, 5, 6) y = Series(randn(10)) - with pytest.raises(NotImplementedError): + msg = "N-dimensional objects, where N > 2, are not supported with eval" + with pytest.raises(NotImplementedError, match=msg): self.eval("x + y", local_dict={"x": x, "y": y}) def test_constant(self): @@ -1232,7 +1247,7 @@ def test_truediv(self): def test_failing_subscript_with_name_error(self): df = DataFrame(np.random.randn(5, 3)) # noqa - with pytest.raises(NameError): + with pytest.raises(NameError, match="name 'x' is not defined"): self.eval("df[x > 2] > 2") def test_lhs_expression_subscript(self): @@ -1379,7 +1394,8 @@ def test_multi_line_expression(self): assert ans is None # multi-line not valid if not all assignments - with pytest.raises(ValueError): + msg = "Multi-line expressions are only valid if all expressions contain" + with pytest.raises(ValueError, match=msg): df.eval( """ a = b + 2 @@ -1474,7 +1490,8 @@ def test_assignment_in_query(self): # GH 8664 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() - with pytest.raises(ValueError): + msg = "cannot assign without a target object" + with pytest.raises(ValueError, match=msg): df.query("a = 1") tm.assert_frame_equal(df, df_orig) @@ -1593,19 +1610,21 @@ def test_simple_in_ops(self): ) assert res else: - with pytest.raises(NotImplementedError): + msg = "'In' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): pd.eval("1 in [1, 2]", engine=self.engine, parser=self.parser) - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): pd.eval("2 in (1, 2)", engine=self.engine, parser=self.parser) - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): pd.eval("3 in (1, 2)", engine=self.engine, parser=self.parser) - with pytest.raises(NotImplementedError): - pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): pd.eval( "[(3,)] in (1, 2, [(3,)])", engine=self.engine, parser=self.parser ) - with pytest.raises(NotImplementedError): + msg = "'NotIn' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): + pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) + with pytest.raises(NotImplementedError, match=msg): pd.eval( "[3] not in (1, 2, [[3]])", engine=self.engine, parser=self.parser ) @@ -1664,13 +1683,15 @@ def test_fails_not(self): def test_fails_ampersand(self): df = DataFrame(np.random.randn(5, 3)) # noqa ex = "(df + 2)[df > 1] > 0 & (df > 0)" - with pytest.raises(NotImplementedError): + msg = "cannot evaluate scalar only bool ops" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, parser=self.parser, engine=self.engine) def test_fails_pipe(self): df = DataFrame(np.random.randn(5, 3)) # noqa ex = "(df + 2)[df > 1] > 0 | (df > 0)" - with pytest.raises(NotImplementedError): + msg = "cannot evaluate scalar only bool ops" + with pytest.raises(NotImplementedError, match=msg): 
pd.eval(ex, parser=self.parser, engine=self.engine) def test_bool_ops_with_constants(self): @@ -1679,7 +1700,8 @@ def test_bool_ops_with_constants(self): ): ex = f"{lhs} {op} {rhs}" if op in ("and", "or"): - with pytest.raises(NotImplementedError): + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): self.eval(ex) else: res = self.eval(ex) @@ -1690,7 +1712,8 @@ def test_simple_bool_ops(self): for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): ex = f"lhs {op} rhs" if op in ("and", "or"): - with pytest.raises(NotImplementedError): + msg = "'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, engine=self.engine, parser=self.parser) else: res = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -1902,19 +1925,21 @@ def test_disallowed_nodes(engine, parser): inst = VisitorClass("x + 1", engine, parser) for ops in uns_ops: - with pytest.raises(NotImplementedError): + msg = "nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): getattr(inst, ops)() def test_syntax_error_exprs(engine, parser): e = "s +" - with pytest.raises(SyntaxError): + with pytest.raises(SyntaxError, match="invalid syntax"): pd.eval(e, engine=engine, parser=parser) def test_name_error_exprs(engine, parser): e = "s + t" - with pytest.raises(NameError): + msg = "name 's' is not defined" + with pytest.raises(NameError, match=msg): pd.eval(e, engine=engine, parser=parser) @@ -1973,7 +1998,8 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): ex2 = f"lhs {cmp} mid and mid {cmp} rhs" ex3 = f"(lhs {cmp} mid) & (mid {cmp} rhs)" for ex in (ex1, ex2, ex3): - with pytest.raises(NotImplementedError): + msg = "cannot evaluate scalar only bool ops|'BoolOp' nodes are not" + with pytest.raises(NotImplementedError, match=msg): pd.eval(ex, engine=engine, parser=parser) @@ -2029,7 +2055,8 @@ def test_negate_lt_eq_le(engine, parser): tm.assert_frame_equal(result, expected) if parser == "python": - with pytest.raises(NotImplementedError): + msg = "'Not' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): df.query("not (cat > 0)", engine=engine, parser=parser) else: result = df.query("not (cat > 0)", engine=engine, parser=parser) @@ -2041,5 +2068,6 @@ def test_validate_bool_args(self): invalid_values = [1, "True", [1, 2, 3], 5.0] for value in invalid_values: - with pytest.raises(ValueError): + msg = 'For argument "inplace" expected type bool, received type' + with pytest.raises(ValueError, match=msg): pd.eval("2+2", inplace=value) diff --git a/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/pandas/tests/dtypes/cast/test_construct_from_scalar.py index cc823a3d6e02c..ed272cef3e7ba 100644 --- a/pandas/tests/dtypes/cast/test_construct_from_scalar.py +++ b/pandas/tests/dtypes/cast/test_construct_from_scalar.py @@ -15,6 +15,4 @@ def test_cast_1d_array_like_from_scalar_categorical(): expected = Categorical(["a", "a"], categories=cats) result = construct_1d_arraylike_from_scalar("a", len(expected), cat_type) - tm.assert_categorical_equal( - result, expected, check_category_order=True, check_dtype=True - ) + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/dtypes/cast/test_upcast.py b/pandas/tests/dtypes/cast/test_upcast.py index bb7a7d059c7ee..f9227a4e78a79 100644 --- a/pandas/tests/dtypes/cast/test_upcast.py +++ b/pandas/tests/dtypes/cast/test_upcast.py @@ -12,7 +12,7 @@ def test_upcast_error(result): # GH23823 require result 
arg to be ndarray mask = np.array([False, True, False]) other = np.array([61, 62, 63]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="The result input must be a ndarray"): result, _ = maybe_upcast_putmask(result, mask, other) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8da2797835080..6e73e1542bb80 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -203,11 +203,18 @@ def test_is_scipy_sparse(): def test_is_categorical(): cat = pd.Categorical([1, 2, 3]) - assert com.is_categorical(cat) - assert com.is_categorical(pd.Series(cat)) - assert com.is_categorical(pd.CategoricalIndex([1, 2, 3])) + with tm.assert_produces_warning(FutureWarning): + assert com.is_categorical(cat) + assert com.is_categorical(pd.Series(cat)) + assert com.is_categorical(pd.CategoricalIndex([1, 2, 3])) + + assert not com.is_categorical([1, 2, 3]) - assert not com.is_categorical([1, 2, 3]) + +def test_is_categorical_deprecation(): + # GH#33385 + with tm.assert_produces_warning(FutureWarning): + com.is_categorical([1, 2, 3]) def test_is_datetime64_dtype(): @@ -281,18 +288,6 @@ def test_is_string_dtype(): assert com.is_string_dtype(pd.array(["a", "b"], dtype="string")) -def test_is_period_arraylike(): - assert not com.is_period_arraylike([1, 2, 3]) - assert not com.is_period_arraylike(pd.Index([1, 2, 3])) - assert com.is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D")) - - -def test_is_datetime_arraylike(): - assert not com.is_datetime_arraylike([1, 2, 3]) - assert not com.is_datetime_arraylike(pd.Index([1, 2, 3])) - assert com.is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3])) - - integer_dtypes: List = [] diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 02daa185b1cdb..1fbbd3356ae13 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -2,7 +2,9 @@ import pandas.core.dtypes.concat as _concat +import pandas as pd from pandas import DatetimeIndex, Period, PeriodIndex, Series, TimedeltaIndex +import pandas._testing as tm @pytest.mark.parametrize( @@ -76,3 +78,13 @@ def test_get_dtype_kinds(index_or_series, to_concat, expected): def test_get_dtype_kinds_period(to_concat, expected): result = _concat.get_dtype_kinds(to_concat) assert result == set(expected) + + +def test_concat_mismatched_categoricals_with_empty(): + # concat_compat behavior on series._values should match pd.concat on series + ser1 = Series(["a", "b", "c"], dtype="category") + ser2 = Series([], dtype="category") + + result = _concat.concat_compat([ser1._values, ser2._values]) + expected = pd.concat([ser1, ser2])._values + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 9eb5fda87d2d2..519f2f3eead1c 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -26,7 +26,14 @@ ) import pandas as pd -from pandas import Categorical, CategoricalIndex, IntervalIndex, Series, date_range +from pandas import ( + Categorical, + CategoricalIndex, + DatetimeIndex, + IntervalIndex, + Series, + date_range, +) import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray, SparseDtype @@ -152,10 +159,12 @@ def test_basic(self, dtype): assert is_categorical_dtype(s) assert not is_categorical_dtype(np.dtype("float64")) - assert is_categorical(s.dtype) - assert is_categorical(s) - assert not is_categorical(np.dtype("float64")) - assert not 
is_categorical(1.0) + with tm.assert_produces_warning(FutureWarning): + # GH#33385 deprecated + assert is_categorical(s.dtype) + assert is_categorical(s) + assert not is_categorical(np.dtype("float64")) + assert not is_categorical(1.0) def test_tuple_categories(self): categories = [(1, "a"), (2, "b"), (3, "c")] @@ -177,6 +186,11 @@ def test_is_boolean(self, categories, expected): assert is_bool_dtype(cat) is expected assert is_bool_dtype(cat.dtype) is expected + def test_dtype_specific_categorical_dtype(self): + expected = "datetime64[ns]" + result = str(Categorical(DatetimeIndex([])).categories.dtype) + assert result == expected + class TestDatetimeTZDtype(Base): @pytest.fixture @@ -349,7 +363,7 @@ def test_hash_vs_equality(self, dtype): assert hash(dtype) == hash(dtype3) def test_construction(self): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Invalid frequency: xx"): PeriodDtype("xx") for s in ["period[D]", "Period[D]", "D"]: @@ -402,21 +416,25 @@ def test_construction_from_string(self, dtype): assert is_dtype_equal(dtype, result) result = PeriodDtype.construct_from_string("period[D]") assert is_dtype_equal(dtype, result) - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("foo") - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("period[foo]") - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("foo[D]") - - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("datetime64[ns]") - with pytest.raises(TypeError): - PeriodDtype.construct_from_string("datetime64[ns, US/Eastern]") with pytest.raises(TypeError, match="list"): PeriodDtype.construct_from_string([1, 2, 3]) + @pytest.mark.parametrize( + "string", + [ + "foo", + "period[foo]", + "foo[D]", + "datetime64[ns]", + "datetime64[ns, US/Eastern]", + ], + ) + def test_construct_dtype_from_string_invalid_raises(self, string): + msg = f"Cannot construct a 'PeriodDtype' from '{string}'" + with pytest.raises(TypeError, match=re.escape(msg)): + PeriodDtype.construct_from_string(string) + def test_is_dtype(self, dtype): assert PeriodDtype.is_dtype(dtype) assert PeriodDtype.is_dtype("period[D]") @@ -463,7 +481,8 @@ def test_basic(self, dtype): def test_empty(self): dt = PeriodDtype() - with pytest.raises(AttributeError): + msg = "object has no attribute 'freqstr'" + with pytest.raises(AttributeError, match=msg): str(dt) def test_not_string(self): @@ -713,10 +732,10 @@ class TestCategoricalDtypeParametrized: pd.date_range("2017", periods=4), ], ) - def test_basic(self, categories, ordered_fixture): - c1 = CategoricalDtype(categories, ordered=ordered_fixture) + def test_basic(self, categories, ordered): + c1 = CategoricalDtype(categories, ordered=ordered) tm.assert_index_equal(c1.categories, pd.Index(categories)) - assert c1.ordered is ordered_fixture + assert c1.ordered is ordered def test_order_matters(self): categories = ["a", "b"] @@ -737,7 +756,7 @@ def test_categories(self): tm.assert_index_equal(result.categories, pd.Index(["a", "b", "c"])) assert result.ordered is False - def test_equal_but_different(self, ordered_fixture): + def test_equal_but_different(self, ordered): c1 = CategoricalDtype([1, 2, 3]) c2 = CategoricalDtype([1.0, 2.0, 3.0]) assert c1 is not c2 @@ -752,11 +771,13 @@ def test_order_hashes_different(self, v1, v2): assert c1 is not c3 def test_nan_invalid(self): - with pytest.raises(ValueError): + msg = "Categorical categories cannot be null" + with pytest.raises(ValueError, match=msg): CategoricalDtype([1, 2, np.nan]) def 
test_non_unique_invalid(self): - with pytest.raises(ValueError): + msg = "Categorical categories must be unique" + with pytest.raises(ValueError, match=msg): CategoricalDtype([1, 2, 1]) def test_same_categories_different_order(self): @@ -799,8 +820,8 @@ def test_categorical_equality(self, ordered1, ordered2): @pytest.mark.parametrize("categories", [list("abc"), None]) @pytest.mark.parametrize("other", ["category", "not a category"]) - def test_categorical_equality_strings(self, categories, ordered_fixture, other): - c1 = CategoricalDtype(categories, ordered_fixture) + def test_categorical_equality_strings(self, categories, ordered, other): + c1 = CategoricalDtype(categories, ordered) result = c1 == other expected = other == "category" assert result is expected @@ -843,12 +864,12 @@ def test_from_categorical_dtype_both(self): ) assert result == CategoricalDtype([1, 2], ordered=False) - def test_str_vs_repr(self, ordered_fixture): - c1 = CategoricalDtype(["a", "b"], ordered=ordered_fixture) + def test_str_vs_repr(self, ordered): + c1 = CategoricalDtype(["a", "b"], ordered=ordered) assert str(c1) == "category" # Py2 will have unicode prefixes pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)" - assert re.match(pat.format(ordered=ordered_fixture), repr(c1)) + assert re.match(pat.format(ordered=ordered), repr(c1)) def test_categorical_categories(self): # GH17884 @@ -861,9 +882,9 @@ def test_categorical_categories(self): "new_categories", [list("abc"), list("cba"), list("wxyz"), None] ) @pytest.mark.parametrize("new_ordered", [True, False, None]) - def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): + def test_update_dtype(self, ordered, new_categories, new_ordered): original_categories = list("abc") - dtype = CategoricalDtype(original_categories, ordered_fixture) + dtype = CategoricalDtype(original_categories, ordered) new_dtype = CategoricalDtype(new_categories, new_ordered) result = dtype.update_dtype(new_dtype) @@ -873,8 +894,8 @@ def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): tm.assert_index_equal(result.categories, expected_categories) assert result.ordered is expected_ordered - def test_update_dtype_string(self, ordered_fixture): - dtype = CategoricalDtype(list("abc"), ordered_fixture) + def test_update_dtype_string(self, ordered): + dtype = CategoricalDtype(list("abc"), ordered) expected_categories = dtype.categories expected_ordered = dtype.ordered result = dtype.update_dtype("category") diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 2c8631ac2d71d..f9ee943d9e6bf 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -37,13 +37,10 @@ def test_abc_types(self): assert isinstance(self.df, gt.ABCDataFrame) assert isinstance(self.sparse_array, gt.ABCSparseArray) assert isinstance(self.categorical, gt.ABCCategorical) - assert isinstance(pd.Period("2012", freq="A-DEC"), gt.ABCPeriod) assert isinstance(pd.DateOffset(), gt.ABCDateOffset) assert isinstance(pd.Period("2012", freq="A-DEC").freq, gt.ABCDateOffset) assert not isinstance(pd.Period("2012", freq="A-DEC"), gt.ABCDateOffset) - assert isinstance(pd.Interval(0, 1.5), gt.ABCInterval) - assert not isinstance(pd.Period("2012", freq="A-DEC"), gt.ABCInterval) assert isinstance(self.datetime_array, gt.ABCDatetimeArray) assert not isinstance(self.datetime_index, gt.ABCDatetimeArray) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 
48ae1f67297af..82d6b1df19393 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -16,7 +16,7 @@ import pytest import pytz -from pandas._libs import iNaT, lib, missing as libmissing +from pandas._libs import lib, missing as libmissing import pandas.util._test_decorators as td from pandas.core.dtypes import inference @@ -50,7 +50,6 @@ Timedelta, TimedeltaIndex, Timestamp, - isna, ) import pandas._testing as tm from pandas.core.arrays import IntegerArray @@ -507,6 +506,13 @@ def test_convert_numeric_int64_uint64(self, case, coerce): result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) + def test_convert_numeric_string_uint64(self): + # GH32394 + result = lib.maybe_convert_numeric( + np.array(["uint64"], dtype=object), set(), coerce_numeric=True + ) + assert np.isnan(result) + @pytest.mark.parametrize("value", [-(2 ** 63) - 1, 2 ** 64]) def test_convert_int_overflow(self, value): # see gh-18584 @@ -568,6 +574,13 @@ def test_maybe_convert_objects_nullable_integer(self, exp): tm.assert_extension_array_equal(result, exp) + def test_maybe_convert_objects_bool_nan(self): + # GH32146 + ind = pd.Index([True, False, np.nan], dtype=object) + exp = np.array([True, False, np.nan], dtype=object) + out = lib.maybe_convert_objects(ind.values, safe=1) + tm.assert_numpy_array_equal(out, exp) + def test_mixed_dtypes_remain_object_array(self): # GH14956 array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) @@ -1423,6 +1436,7 @@ def test_is_scalar_pandas_scalars(self): assert is_scalar(Period("2014-01-01")) assert is_scalar(Interval(left=0, right=1)) assert is_scalar(DateOffset(days=1)) + assert is_scalar(pd.offsets.Minute(3)) def test_is_scalar_pandas_containers(self): assert not is_scalar(Series(dtype=object)) @@ -1431,6 +1445,11 @@ def test_is_scalar_pandas_containers(self): assert not is_scalar(DataFrame([[1]])) assert not is_scalar(Index([])) assert not is_scalar(Index([1])) + assert not is_scalar(Categorical([])) + assert not is_scalar(DatetimeIndex([])._data) + assert not is_scalar(TimedeltaIndex([])._data) + assert not is_scalar(DatetimeIndex([])._data.to_period("D")) + assert not is_scalar(pd.array([1, 2, 3])) def test_is_scalar_number(self): # Number() is not recognied by PyNumber_Check, so by extension @@ -1460,14 +1479,12 @@ def test_nan_to_nat_conversions(): dict({"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")}) ) df.iloc[3:6, :] = np.nan - result = df.loc[4, "B"].value - assert result == iNaT + result = df.loc[4, "B"] + assert result is pd.NaT s = df["B"].copy() - s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) - assert isna(s[8]) - - assert s[8].value == np.datetime64("NaT").astype(np.int64) + s[8:9] = np.nan + assert s[8] is pd.NaT @td.skip_if_no_scipy diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 7ba59786bb0fa..cad46d0a23967 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -185,6 +185,21 @@ def test_isna_datetime(self): exp = np.zeros(len(mask), dtype=bool) tm.assert_numpy_array_equal(mask, exp) + def test_isna_old_datetimelike(self): + # isna_old should work for dt64tz, td64, and period, not just tznaive + dti = pd.date_range("2016-01-01", periods=3) + dta = dti._data + dta[-1] = pd.NaT + expected = np.array([False, False, True], dtype=bool) + + objs = [dta, dta.tz_localize("US/Eastern"), dta - dta, dta.to_period("D")] + + for obj in 
objs: + with cf.option_context("mode.use_inf_as_na", True): + result = pd.isna(obj) + + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "value, expected", [ diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 58859fc6ac54c..f33f960e8e341 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -10,9 +10,21 @@ class BaseCastingTests(BaseExtensionTests): """Casting to and from ExtensionDtypes""" def test_astype_object_series(self, all_data): - ser = pd.Series({"A": all_data}) + ser = pd.Series(all_data, name="A") result = ser.astype(object) - assert isinstance(result._data.blocks[0], ObjectBlock) + assert isinstance(result._mgr.blocks[0], ObjectBlock) + + def test_astype_object_frame(self, all_data): + df = pd.DataFrame({"A": all_data}) + + result = df.astype(object) + blk = result._data.blocks[0] + assert isinstance(blk, ObjectBlock), type(blk) + + # FIXME: these currently fail; dont leave commented-out + # check that we can compare the dtypes + # cmp = result.dtypes.equals(df.dtypes) + # assert not cmp.any() def test_tolist(self, data): result = pd.Series(data).tolist() diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index c40646ca2415e..1ddc7af0f6268 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -25,13 +25,13 @@ def test_series_constructor(self, data): result = pd.Series(data) assert result.dtype == data.dtype assert len(result) == len(data) - assert isinstance(result._data.blocks[0], ExtensionBlock) - assert result._data.blocks[0].values is data + assert isinstance(result._mgr.blocks[0], ExtensionBlock) + assert result._mgr.blocks[0].values is data # Series[EA] is unboxed / boxed correctly result2 = pd.Series(result) assert result2.dtype == data.dtype - assert isinstance(result2._data.blocks[0], ExtensionBlock) + assert isinstance(result2._mgr.blocks[0], ExtensionBlock) @pytest.mark.parametrize("from_series", [True, False]) def test_dataframe_constructor_from_dict(self, data, from_series): @@ -40,13 +40,13 @@ def test_dataframe_constructor_from_dict(self, data, from_series): result = pd.DataFrame({"A": data}) assert result.dtypes["A"] == data.dtype assert result.shape == (len(data), 1) - assert isinstance(result._data.blocks[0], ExtensionBlock) + assert isinstance(result._mgr.blocks[0], ExtensionBlock) def test_dataframe_from_series(self, data): result = pd.DataFrame(pd.Series(data)) assert result.dtypes[0] == data.dtype assert result.shape == (len(data), 1) - assert isinstance(result._data.blocks[0], ExtensionBlock) + assert isinstance(result._mgr.blocks[0], ExtensionBlock) def test_series_given_mismatched_index_raises(self, data): msg = "Length of passed values is 3, index implies 5" diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index b08a64cc076b6..dc94bffd320b1 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -40,6 +40,34 @@ def test_iloc_frame(self, data): result = df.iloc[:4, 0] self.assert_series_equal(result, expected) + # GH#32959 slice columns with step + result = df.iloc[:, ::2] + self.assert_frame_equal(result, df[["A"]]) + result = df[["B", "A"]].iloc[:, ::2] + self.assert_frame_equal(result, df[["B"]]) + + def test_iloc_frame_single_block(self, data): + # GH#32959 null slice along index, slice along columns with single-block + df = 
pd.DataFrame({"A": data}) + + result = df.iloc[:, :] + self.assert_frame_equal(result, df) + + result = df.iloc[:, :1] + self.assert_frame_equal(result, df) + + result = df.iloc[:, :2] + self.assert_frame_equal(result, df) + + result = df.iloc[:, ::2] + self.assert_frame_equal(result, df) + + result = df.iloc[:, 1:2] + self.assert_frame_equal(result, df.iloc[:, :0]) + + result = df.iloc[:, -1:] + self.assert_frame_equal(result, df) + def test_loc_series(self, data): ser = pd.Series(data) result = ser.loc[:3] @@ -356,7 +384,7 @@ def test_loc_len1(self, data): # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim df = pd.DataFrame({"A": data}) res = df.loc[[0], "A"] - assert res._data._block.ndim == 1 + assert res._mgr._block.ndim == 1 def test_item(self, data): # https://github.com/pandas-dev/pandas/pull/30175 diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index cdea96334be2a..9ae4b01508d79 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -19,6 +19,9 @@ class BaseInterfaceTests(BaseExtensionTests): def test_len(self, data): assert len(data) == 100 + def test_size(self, data): + assert data.size == 100 + def test_ndim(self, data): assert data.ndim == 1 @@ -53,7 +56,7 @@ def test_no_values_attribute(self, data): def test_is_numeric_honored(self, data): result = pd.Series(data) - assert result._data.blocks[0].is_numeric is data.dtype._is_numeric + assert result._mgr.blocks[0].is_numeric is data.dtype._is_numeric def test_isna_extension_array(self, data_missing): # If your `isna` returns an ExtensionArray, you must also implement diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index ec21898852888..c9445ceec2c77 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -27,7 +27,7 @@ def test_concat(self, data, in_frame): dtype = result.dtype assert dtype == data.dtype - assert isinstance(result._data.blocks[0], ExtensionBlock) + assert isinstance(result._mgr.blocks[0], ExtensionBlock) @pytest.mark.parametrize("in_frame", [True, False]) def test_concat_all_na_block(self, data_missing, in_frame): @@ -295,6 +295,14 @@ def test_unstack(self, data, index, obj): assert all( isinstance(result[col].array, type(data)) for col in result.columns ) + + if obj == "series": + # We should get the same result with to_frame+unstack+droplevel + df = ser.to_frame() + + alt = df.unstack(level=level).droplevel(0, axis=1) + self.assert_frame_equal(result, alt) + expected = ser.astype(object).unstack(level=level) result = result.astype(object) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index a4fe89df158fa..dece8098c8542 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -1,5 +1,3 @@ -import operator - import numpy as np import pytest @@ -60,7 +58,7 @@ def test_setitem_sequence_broadcasts(self, data, box_in_series): def test_setitem_scalar(self, data, setter): arr = pd.Series(data) setter = getattr(arr, setter) - operator.setitem(setter, 0, data[1]) + setter[0] = data[1] assert arr[0] == data[1] def test_setitem_loc_scalar_mixed(self, data): @@ -196,7 +194,7 @@ def test_setitem_mask_aligned(self, data, as_callable, setter): # Series.__setitem__ target = ser - operator.setitem(target, mask2, data[5:7]) + target[mask2] = data[5:7] ser[mask2] = data[5:7] assert ser[0] == data[5] @@ -213,7 
+211,7 @@ def test_setitem_mask_broadcast(self, data, setter): else: # __setitem__ target = ser - operator.setitem(target, mask, data[10]) + target[mask] = data[10] assert ser[0] == data[10] assert ser[1] == data[10] diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 9384ed5199c1f..85d8ad6ec6e38 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -79,7 +79,9 @@ def _from_factorized(cls, values, original): _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) - def to_numpy(self, dtype=None, copy=False, na_value=no_default, decimals=None): + def to_numpy( + self, dtype=None, copy: bool = False, na_value=no_default, decimals=None + ) -> np.ndarray: result = np.asarray(self, dtype=dtype) if decimals is not None: result = np.asarray([round(x, decimals) for x in result]) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 69a97f5c9fe02..059d3453995bd 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -282,6 +282,19 @@ def _compare_other(self, s, data, op_name, other): with pytest.raises(TypeError, match=msg): op(data, other) + @pytest.mark.parametrize( + "categories", + [["a", "b"], [0, 1], [pd.Timestamp("2019"), pd.Timestamp("2020")]], + ) + def test_not_equal_with_na(self, categories): + # https://github.com/pandas-dev/pandas/issues/32276 + c1 = Categorical.from_codes([-1, 0], categories=categories) + c2 = Categorical.from_codes([0, 1], categories=categories) + + result = c1 != c2 + + assert result.all() + class TestParsing(base.BaseParsingTests): pass diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 6311070cfe2bb..9925fd51561ae 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -3,12 +3,13 @@ import pandas as pd from pandas.core.internals import BlockManager -from pandas.core.internals.blocks import Block, NonConsolidatableMixIn +from pandas.core.internals.blocks import ExtensionBlock -class CustomBlock(NonConsolidatableMixIn, Block): +class CustomBlock(ExtensionBlock): _holder = np.ndarray + _can_hold_na = False def concat_same_type(self, to_concat, placement=None): """ @@ -16,15 +17,14 @@ def concat_same_type(self, to_concat, placement=None): always 1D in this custom Block """ values = np.concatenate([blk.values for blk in to_concat]) - return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1) - ) + placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) + return self.make_block_same_class(values, placement=placement) @pytest.fixture def df(): df1 = pd.DataFrame({"a": [1, 2, 3]}) - blocks = df1._data.blocks + blocks = df1._mgr.blocks values = np.arange(3, dtype="int64") custom_block = CustomBlock(values, placement=slice(1, 2)) blocks = blocks + (custom_block,) @@ -32,24 +32,14 @@ def df(): return pd.DataFrame(block_manager) -def test_concat_series(): - # GH17728 - values = np.arange(3, dtype="int64") - block = CustomBlock(values, placement=slice(0, 3)) - s = pd.Series(block, pd.RangeIndex(3), fastpath=True) - - res = pd.concat([s, s]) - assert isinstance(res._data.blocks[0], CustomBlock) - - def test_concat_dataframe(df): # GH17728 res = pd.concat([df, df]) - assert isinstance(res._data.blocks[1], CustomBlock) + assert isinstance(res._mgr.blocks[1], CustomBlock) def 
test_concat_axis1(df): # GH17954 df2 = pd.DataFrame({"c": [0.1, 0.2, 0.3]}) res = pd.concat([df, df2], axis=1) - assert isinstance(res._data.blocks[1], CustomBlock) + assert isinstance(res._mgr.blocks[1], CustomBlock) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index f55ec75b47dfa..725533765ca2c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -238,9 +238,10 @@ def check_reduce(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 result = getattr(s, op_name)(skipna=skipna) - expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - if np.isnan(expected): + if not skipna and s.isna().any(): expected = pd.NA + else: + expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 61c5925383f88..aa5a99282131a 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -170,8 +170,11 @@ def test_take_series(self, data): # ValueError: PandasArray must be 1-dimensional. super().test_take_series(data) - @pytest.mark.xfail(reason="astype doesn't recognize data.dtype") def test_loc_iloc_frame_single_dtype(self, data): + npdtype = data.dtype.numpy_dtype + if npdtype == object or npdtype == np.float64: + # GH#33125 + pytest.xfail(reason="GH#33125 astype doesn't recognize data.dtype") super().test_loc_iloc_frame_single_dtype(data) @@ -179,6 +182,8 @@ class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): @skip_nested def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): # ValueError: Names should be list-like for a MultiIndex + if data_for_grouping.dtype.numpy_dtype == np.float64: + pytest.xfail(reason="GH#33125 astype doesn't recognize data.dtype") super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) @@ -276,7 +281,11 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): class TestPrinting(BaseNumPyTests, base.BasePrintingTests): - pass + @pytest.mark.xfail( + reason="GH#33125 PandasArray.astype does not recognize PandasDtype" + ) + def test_series_repr(self, data): + super().test_series_repr(data) @skip_nested @@ -321,6 +330,18 @@ class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): def test_concat_mixed_dtypes(self, data): super().test_concat_mixed_dtypes(data) + @pytest.mark.xfail( + reason="GH#33125 PandasArray.astype does not recognize PandasDtype" + ) + def test_concat(self, data, in_frame): + super().test_concat(data, in_frame) + + @pytest.mark.xfail( + reason="GH#33125 PandasArray.astype does not recognize PandasDtype" + ) + def test_concat_all_na_block(self, data_missing, in_frame): + super().test_concat_all_na_block(data_missing, in_frame) + @skip_nested def test_merge(self, data, na_value): # Fails creating expected diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 198a228b621b4..694bbee59606f 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -3,6 +3,8 @@ from pandas.errors import PerformanceWarning +from pandas.core.dtypes.common import is_object_dtype + import pandas as pd from pandas import SparseDtype import pandas._testing as tm @@ -309,7 +311,25 @@ def test_searchsorted(self, data_for_sorting, as_series): class 
TestCasting(BaseSparseTests, base.BaseCastingTests): - pass + def test_astype_object_series(self, all_data): + # Unlike the base class, we do not expect the resulting Block + # to be ObjectBlock + ser = pd.Series(all_data, name="A") + result = ser.astype(object) + assert is_object_dtype(result._data.blocks[0].dtype) + + def test_astype_object_frame(self, all_data): + # Unlike the base class, we do not expect the resulting Block + # to be ObjectBlock + df = pd.DataFrame({"A": all_data}) + + result = df.astype(object) + assert is_object_dtype(result._data.blocks[0].dtype) + + # FIXME: these currently fail; dont leave commented-out + # check that we can compare the dtypes + # comp = result.dtypes.equals(df.dtypes) + # assert not comp.any() class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests): diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 03598b6bb5eca..486d140849159 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -40,8 +40,8 @@ def float_frame_with_na(): """ df = DataFrame(tm.getSeriesData()) # set some NAs - df.loc[5:10] = np.nan - df.loc[15:20, -2:] = np.nan + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan return df @@ -74,71 +74,11 @@ def bool_frame_with_na(): df = DataFrame(tm.getSeriesData()) > 0 df = df.astype(object) # set some NAs - df.loc[5:10] = np.nan - df.loc[15:20, -2:] = np.nan + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan return df -@pytest.fixture -def int_frame(): - """ - Fixture for DataFrame of ints with index of unique strings - - Columns are ['A', 'B', 'C', 'D'] - - A B C D - vpBeWjM651 1 0 1 0 - 5JyxmrP1En -1 0 0 0 - qEDaoD49U2 -1 1 0 0 - m66TkTfsFe 0 0 0 0 - EHPaNzEUFm -1 0 -1 0 - fpRJCevQhi 2 0 0 0 - OlQvnmfi3Q 0 0 -2 0 - ... .. .. .. .. - uB1FPlz4uP 0 0 0 1 - EcSe6yNzCU 0 0 -1 0 - L50VudaiI8 -1 1 -2 0 - y3bpw4nwIp 0 -1 0 0 - H0RdLLwrCT 1 1 0 0 - rY82K0vMwm 0 0 0 0 - 1OPIUjnkjk 2 0 0 0 - - [30 rows x 4 columns] - """ - df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) - # force these all to int64 to avoid platform testing issues - return DataFrame({c: s for c, s in df.items()}, dtype=np.int64) - - -@pytest.fixture -def datetime_frame(): - """ - Fixture for DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D'] - - A B C D - 2000-01-03 -1.122153 0.468535 0.122226 1.693711 - 2000-01-04 0.189378 0.486100 0.007864 -1.216052 - 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 - 2000-01-06 0.430050 0.894352 0.090719 0.036939 - 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 - 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 - 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 - ... ... ... ... ... 
- 2000-02-03 1.642618 -0.579288 0.046005 1.385249 - 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 - 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 - 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 - 2000-02-09 1.377373 0.398619 1.008453 -0.928207 - 2000-02-10 0.473194 -0.636677 0.984058 0.511519 - 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 - - [30 rows x 4 columns] - """ - return DataFrame(tm.getTimeSeriesData()) - - @pytest.fixture def float_string_frame(): """ diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index f5b3f980cc534..d94dc8d2ffe00 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -45,7 +45,7 @@ def test_assignment(self): result1 = df["D"] result2 = df["E"] - tm.assert_categorical_equal(result1._data._block.values, d) + tm.assert_categorical_equal(result1._mgr._block.values, d) # sorting s.name = "E" @@ -391,11 +391,3 @@ def test_loc_indexing_preserves_index_category_dtype(self): result = df.loc[["a"]].index.levels[0] tm.assert_index_equal(result, expected) - - def test_wrong_length_cat_dtype_raises(self): - # GH29523 - cat = pd.Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) - df = pd.DataFrame({"bar": range(10)}) - err = "Length of values does not match length of index" - with pytest.raises(ValueError, match=err): - df["foo"] = cat diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py index 6bfcac3793584..1937a4c380dc9 100644 --- a/pandas/tests/frame/indexing/test_datetime.py +++ b/pandas/tests/frame/indexing/test_datetime.py @@ -20,8 +20,8 @@ def test_setitem(self, timezone_frame): # assert that A & C are not sharing the same base (e.g. 
they # are copies) - b1 = df._data.blocks[1] - b2 = df._data.blocks[2] + b1 = df._mgr.blocks[1] + b2 = df._mgr.blocks[2] tm.assert_extension_array_equal(b1.values, b2.values) assert id(b1.values._data.base) != id(b2.values._data.base) @@ -40,16 +40,7 @@ def test_set_reset(self): # set/reset df = DataFrame({"A": [0, 1, 2]}, index=idx) result = df.reset_index() - assert result["foo"].dtype, "M8[ns, US/Eastern" + assert result["foo"].dtype == "datetime64[ns, US/Eastern]" df = result.set_index("foo") tm.assert_index_equal(df.index, idx) - - def test_scalar_assignment(self): - # issue #19843 - df = pd.DataFrame(index=(0, 1, 2)) - df["now"] = pd.Timestamp("20130101", tz="UTC") - expected = pd.DataFrame( - {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2] - ) - tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_delitem.py b/pandas/tests/frame/indexing/test_delitem.py new file mode 100644 index 0000000000000..f6c7b6ed5d14d --- /dev/null +++ b/pandas/tests/frame/indexing/test_delitem.py @@ -0,0 +1,57 @@ +import re + +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex + + +class TestDataFrameDelItem: + def test_delitem(self, float_frame): + del float_frame["A"] + assert "A" not in float_frame + + def test_delitem_multiindex(self): + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) + df = DataFrame(np.random.randn(4, 4), columns=midx) + assert len(df.columns) == 4 + assert ("A",) in df.columns + assert "A" in df.columns + + result = df["A"] + assert isinstance(result, DataFrame) + del df["A"] + + assert len(df.columns) == 2 + + # A still in the levels, BUT get a KeyError if trying + # to delete + assert ("A",) not in df.columns + with pytest.raises(KeyError, match=re.escape("('A',)")): + del df[("A",)] + + # behavior of dropped/deleted MultiIndex levels changed from + # GH 2770 to GH 19027: MultiIndex no longer '.__contains__' + # levels which are dropped/deleted + assert "A" not in df.columns + with pytest.raises(KeyError, match=re.escape("('A',)")): + del df["A"] + + def test_delitem_corner(self, float_frame): + f = float_frame.copy() + del f["D"] + assert len(f.columns) == 3 + with pytest.raises(KeyError, match=r"^'D'$"): + del f["D"] + del f["B"] + assert len(f.columns) == 2 + + def test_delitem_col_still_multiindex(self): + arrays = [["a", "b", "c", "top"], ["", "", "", "OD"], ["", "", "", "wx"]] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + + df = DataFrame(np.random.randn(3, 4), columns=index) + del df[("a", "", "")] + assert isinstance(df.columns, MultiIndex) diff --git a/pandas/tests/frame/indexing/test_get.py b/pandas/tests/frame/indexing/test_get.py new file mode 100644 index 0000000000000..5f2651eec683c --- /dev/null +++ b/pandas/tests/frame/indexing/test_get.py @@ -0,0 +1,27 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestGet: + def test_get(self, float_frame): + b = float_frame.get("B") + tm.assert_series_equal(b, float_frame["B"]) + + assert float_frame.get("foo") is None + tm.assert_series_equal( + float_frame.get("foo", float_frame["B"]), float_frame["B"] + ) + + @pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=list("AB")), + DataFrame(columns=list("AB"), index=range(3)), + ], + ) + def test_get_none(self, df): + # see gh-5652 + assert df.get(None) is None diff --git a/pandas/tests/frame/indexing/test_iat.py b/pandas/tests/frame/indexing/test_iat.py index 23e3392251a3a..b1025b99e9bd5 100644 --- 
a/pandas/tests/frame/indexing/test_iat.py +++ b/pandas/tests/frame/indexing/test_iat.py @@ -1,3 +1,6 @@ +import pandas as pd + + def test_iat(float_frame): for i, row in enumerate(float_frame.index): @@ -5,3 +8,9 @@ def test_iat(float_frame): result = float_frame.iat[i, j] expected = float_frame.at[row, col] assert result == expected + + +def test_iat_duplicate_columns(): + # https://github.com/pandas-dev/pandas/issues/11754 + df = pd.DataFrame([[1, 2]], columns=["x", "x"]) + assert df.iat[0, 0] == 1 diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 636cca0df9d4e..ed3c4689c92d9 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -27,28 +27,8 @@ from pandas.tseries.offsets import BDay - -class TestGet: - def test_get(self, float_frame): - b = float_frame.get("B") - tm.assert_series_equal(b, float_frame["B"]) - - assert float_frame.get("foo") is None - tm.assert_series_equal( - float_frame.get("foo", float_frame["B"]), float_frame["B"] - ) - - @pytest.mark.parametrize( - "df", - [ - DataFrame(), - DataFrame(columns=list("AB")), - DataFrame(columns=list("AB"), index=range(3)), - ], - ) - def test_get_none(self, df): - # see gh-5652 - assert df.get(None) is None +# We pass through a TypeError raised by numpy +_slice_msg = "slice indices must be integers or None or have an __index__ method" class TestDataFrameIndexing: @@ -212,6 +192,63 @@ def test_setitem_list_of_tuples(self, float_frame): expected = Series(tuples, index=float_frame.index, name="tuples") tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "columns,box,expected", + [ + ( + ["A", "B", "C", "D"], + 7, + pd.DataFrame( + [[7, 7, 7, 7], [7, 7, 7, 7], [7, 7, 7, 7]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "D"], + [7, 8], + pd.DataFrame( + [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "B", "C"], + np.array([7, 8, 9], dtype=np.int64), + pd.DataFrame( + [[7, 8, 9], [7, 8, 9], [7, 8, 9]], columns=["A", "B", "C"] + ), + ), + ( + ["B", "C", "D"], + [[7, 8, 9], [10, 11, 12], [13, 14, 15]], + pd.DataFrame( + [[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["C", "A", "D"], + np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]], dtype=np.int64), + pd.DataFrame( + [[8, 2, 7, 9], [11, 4, 10, 12], [14, 6, 13, 15]], + columns=["A", "B", "C", "D"], + ), + ), + ( + ["A", "C"], + pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + pd.DataFrame( + [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] + ), + ), + ], + ) + def test_setitem_list_missing_columns(self, columns, box, expected): + # GH 29334 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df[columns] = box + tm.assert_frame_equal(df, expected) + def test_setitem_multi_index(self): # GH7655, test that assigning to a sub-frame of a frame # with multi-index columns aligns both rows and columns @@ -355,16 +392,6 @@ def test_getitem_boolean_casting(self, datetime_frame): ) tm.assert_series_equal(result, expected) - # where dtype conversions - # GH 3733 - df = DataFrame(data=np.random.randn(100, 50)) - df = df.where(df > 0) # create nans - bools = df > 0 - mask = isna(df) - expected = bools.astype(float).mask(mask) - result = bools.mask(mask) - tm.assert_frame_equal(result, expected) - def test_getitem_boolean_list(self): df = DataFrame(np.arange(12).reshape(3, 4)) @@ -456,13 +483,6 @@ def 
test_setitem(self, float_frame): float_frame["col6"] = series tm.assert_series_equal(series, float_frame["col6"], check_names=False) - msg = ( - r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the " - r"\[columns\]\"" - ) - with pytest.raises(KeyError, match=msg): - float_frame[np.random.randn(len(float_frame) + 1)] = 1 - # set ndarray arr = np.random.randn(len(float_frame)) float_frame["col9"] = arr @@ -801,15 +821,6 @@ def test_getitem_empty_frame_with_boolean(self): df2 = df[df > 0] tm.assert_frame_equal(df, df2) - def test_delitem_corner(self, float_frame): - f = float_frame.copy() - del f["D"] - assert len(f.columns) == 3 - with pytest.raises(KeyError, match=r"^'D'$"): - del f["D"] - del f["B"] - assert len(f.columns) == 2 - def test_slice_floats(self): index = [52195.504153, 52196.303147, 52198.369883] df = DataFrame(np.random.rand(3, 2), index=index) @@ -994,7 +1005,8 @@ def test_getitem_setitem_fancy_exceptions(self, float_frame): with pytest.raises(IndexingError, match="Too many indexers"): ix[:, :, :] - with pytest.raises(IndexingError, match="Too many indexers"): + with pytest.raises(IndexError, match="too many indices for array"): + # GH#32257 we let numpy do validation, get their exception ix[:, :, :] = 1 def test_getitem_setitem_boolean_misaligned(self, float_frame): @@ -1073,7 +1085,7 @@ def test_getitem_setitem_float_labels(self): cp = df.copy() - with pytest.raises(TypeError, match=msg): + with pytest.raises(TypeError, match=_slice_msg): cp.iloc[1.0:5] = 0 with pytest.raises(TypeError, match=msg): @@ -1205,7 +1217,7 @@ def test_setitem_frame_mixed(self, float_string_frame): piece = DataFrame( [[1.0, 2.0], [3.0, 4.0]], index=f.index[0:2], columns=["A", "B"] ) - key = (slice(None, 2), ["A", "B"]) + key = (f.index[slice(None, 2)], ["A", "B"]) f.loc[key] = piece tm.assert_almost_equal(f.loc[f.index[0:2], ["A", "B"]].values, piece.values) @@ -1216,7 +1228,7 @@ def test_setitem_frame_mixed(self, float_string_frame): index=list(f.index[0:2]) + ["foo", "bar"], columns=["A", "B"], ) - key = (slice(None, 2), ["A", "B"]) + key = (f.index[slice(None, 2)], ["A", "B"]) f.loc[key] = piece tm.assert_almost_equal( f.loc[f.index[0:2:], ["A", "B"]].values, piece.values[0:2] @@ -1226,7 +1238,7 @@ def test_setitem_frame_mixed(self, float_string_frame): f = float_string_frame.copy() piece = f.loc[f.index[:2], ["A"]] piece.index = f.index[-2:] - key = (slice(-2, None), ["A", "B"]) + key = (f.index[slice(-2, None)], ["A", "B"]) f.loc[key] = piece piece["B"] = np.nan tm.assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) @@ -1234,7 +1246,7 @@ def test_setitem_frame_mixed(self, float_string_frame): # ndarray f = float_string_frame.copy() piece = float_string_frame.loc[f.index[:2], ["A", "B"]] - key = (slice(-2, None), ["A", "B"]) + key = (f.index[slice(-2, None)], ["A", "B"]) f.loc[key] = piece.values tm.assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) @@ -1370,6 +1382,24 @@ def test_lookup_raises(self, float_frame): with pytest.raises(ValueError, match="same size"): float_frame.lookup(["a", "b", "c"], ["a"]) + def test_lookup_requires_unique_axes(self): + # GH#33041 raise with a helpful error message + df = pd.DataFrame(np.random.randn(6).reshape(3, 2), columns=["A", "A"]) + + rows = [0, 1] + cols = ["A", "A"] + + # homogeneous-dtype case + with pytest.raises(ValueError, match="requires unique index and columns"): + df.lookup(rows, cols) + with pytest.raises(ValueError, match="requires unique index and columns"): + df.T.lookup(cols, rows) + 
+ # heterogeneous dtype + df["B"] = 0 + with pytest.raises(ValueError, match="requires unique index and columns"): + df.lookup(rows, cols) + def test_set_value(self, float_frame): for idx in float_frame.index: for col in float_frame.columns: @@ -1402,6 +1432,81 @@ def test_set_value_resize(self, float_frame): with pytest.raises(ValueError, match=msg): res._set_value("foobar", "baz", "sam") + def test_reindex_with_multi_index(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests for reindexing a multi-indexed DataFrame with a new MultiIndex + # + # confirms that we can reindex a multi-indexed DataFrame with a new + # MultiIndex object correctly when using no filling, backfilling, and + # padding + # + # The DataFrame, `df`, used in this test is: + # c + # a b + # -1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 0 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # + # and the other MultiIndex, `new_multi_index`, is: + # 0: 0 0.5 + # 1: 2.0 + # 2: 5.0 + # 3: 5.8 + df = pd.DataFrame( + { + "a": [-1] * 7 + [0] * 7 + [1] * 7, + "b": list(range(7)) * 3, + "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, + } + ).set_index(["a", "b"]) + new_index = [0.5, 2.0, 5.0, 5.8] + new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) + + # reindexing w/o a `method` value + reindexed = df.reindex(new_multi_index) + expected = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} + ).set_index(["a", "b"]) + tm.assert_frame_equal(expected, reindexed) + + # reindexing with backfilling + expected = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} + ).set_index(["a", "b"]) + reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + # reindexing with padding + expected = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} + ).set_index(["a", "b"]) + reindexed_with_padding = df.reindex(new_multi_index, method="pad") + tm.assert_frame_equal(expected, reindexed_with_padding) + + reindexed_with_padding = df.reindex(new_multi_index, method="ffill") + tm.assert_frame_equal(expected, reindexed_with_padding) + def test_set_value_with_index_dtype_change(self): df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC")) @@ -1576,11 +1681,6 @@ def test_reindex_methods(self, method, expected_values): actual = df.reindex(target, method=method) tm.assert_frame_equal(expected, actual) - actual = df.reindex_like(df, method=method, tolerance=0) - tm.assert_frame_equal(df, actual) - actual = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) - tm.assert_frame_equal(df, actual) - actual = df.reindex(target, method=method, tolerance=1) tm.assert_frame_equal(expected, actual) actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) @@ -1821,22 +1921,6 @@ def test_getitem_sparse_column(self): result = df.loc[:, "A"] tm.assert_series_equal(result, expected) - def test_setitem_with_sparse_value(self): - # GH8131 - df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) - sp_array = SparseArray([0, 0, 1]) - df["new_column"] = sp_array - tm.assert_series_equal( - df["new_column"], pd.Series(sp_array, name="new_column"), check_names=False - ) - - def 
test_setitem_with_unaligned_sparse_value(self): - df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) - sp_series = pd.Series(SparseArray([0, 0, 1]), index=[2, 1, 0]) - df["new_column"] = sp_series - exp = pd.Series(SparseArray([1, 0, 0]), name="new_column") - tm.assert_series_equal(df["new_column"], exp) - def test_setitem_with_unaligned_tz_aware_datetime_column(self): # GH 12981 # Assignment of unaligned offset-aware datetime series. @@ -1869,7 +1953,7 @@ def test_setitem_datetimelike_with_inference(self): df = DataFrame(index=date_range("20130101", periods=4)) df["A"] = np.array([1 * one_hour] * 4, dtype="m8[ns]") df.loc[:, "B"] = np.array([2 * one_hour] * 4, dtype="m8[ns]") - df.loc[:3, "C"] = np.array([3 * one_hour] * 3, dtype="m8[ns]") + df.loc[df.index[:3], "C"] = np.array([3 * one_hour] * 3, dtype="m8[ns]") df.loc[:, "D"] = np.array([4 * one_hour] * 4, dtype="m8[ns]") df.loc[df.index[:3], "E"] = np.array([5 * one_hour] * 3, dtype="m8[ns]") df["F"] = np.timedelta64("NaT") @@ -2031,69 +2115,6 @@ def test_boolean_indexing_mixed(self): with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 - def test_mask(self): - df = DataFrame(np.random.randn(5, 3)) - cond = df > 0 - - rs = df.where(cond, np.nan) - tm.assert_frame_equal(rs, df.mask(df <= 0)) - tm.assert_frame_equal(rs, df.mask(~cond)) - - other = DataFrame(np.random.randn(5, 3)) - rs = df.where(cond, other) - tm.assert_frame_equal(rs, df.mask(df <= 0, other)) - tm.assert_frame_equal(rs, df.mask(~cond, other)) - - # see gh-21891 - df = DataFrame([1, 2]) - res = df.mask([[True], [False]]) - - exp = DataFrame([np.nan, 2]) - tm.assert_frame_equal(res, exp) - - def test_mask_inplace(self): - # GH8801 - df = DataFrame(np.random.randn(5, 3)) - cond = df > 0 - - rdf = df.copy() - - rdf.where(cond, inplace=True) - tm.assert_frame_equal(rdf, df.where(cond)) - tm.assert_frame_equal(rdf, df.mask(~cond)) - - rdf = df.copy() - rdf.where(cond, -df, inplace=True) - tm.assert_frame_equal(rdf, df.where(cond, -df)) - tm.assert_frame_equal(rdf, df.mask(~cond, -df)) - - def test_mask_edge_case_1xN_frame(self): - # GH4071 - df = DataFrame([[1, 2]]) - res = df.mask(DataFrame([[True, False]])) - expec = DataFrame([[np.nan, 2]]) - tm.assert_frame_equal(res, expec) - - def test_mask_callable(self): - # GH 12533 - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = df.mask(lambda x: x > 4, lambda x: x + 1) - exp = DataFrame([[1, 2, 3], [4, 6, 7], [8, 9, 10]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, df.mask(df > 4, df + 1)) - - # return ndarray and scalar - result = df.mask(lambda x: (x % 2 == 0).values, lambda x: 99) - exp = DataFrame([[1, 99, 3], [99, 5, 99], [7, 99, 9]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, df.mask(df % 2 == 0, 99)) - - # chain - result = (df + 2).mask(lambda x: x > 8, lambda x: x + 10) - exp = DataFrame([[3, 4, 5], [6, 7, 8], [19, 20, 21]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, (df + 2).mask((df + 2) > 8, (df + 2) + 10)) - def test_type_error_multiindex(self): # See gh-12218 df = DataFrame( @@ -2208,17 +2229,18 @@ def test_object_casting_indexing_wraps_datetimelike(): assert isinstance(ser.values[1], pd.Timestamp) assert isinstance(ser.values[2], pd.Timedelta) - mgr = df._data + mgr = df._mgr + mgr._rebuild_blknos_and_blklocs() arr = mgr.fast_xs(0) assert isinstance(arr[1], pd.Timestamp) assert isinstance(arr[2], pd.Timedelta) - blk = mgr.blocks[mgr._blknos[1]] + blk = mgr.blocks[mgr.blknos[1]] assert blk.dtype == 
"M8[ns]" # we got the right block val = blk.iget((0, 0)) assert isinstance(val, pd.Timestamp) - blk = mgr.blocks[mgr._blknos[2]] + blk = mgr.blocks[mgr.blknos[2]] assert blk.dtype == "m8[ns]" # we got the right block val = blk.iget((0, 0)) assert isinstance(val, pd.Timedelta) diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py new file mode 100644 index 0000000000000..622c93d1c2fdc --- /dev/null +++ b/pandas/tests/frame/indexing/test_insert.py @@ -0,0 +1,68 @@ +""" +test_insert is specifically for the DataFrame.insert method; not to be +confused with tests with "insert" in their names that are really testing +__setitem__. +""" +import numpy as np +import pytest + +from pandas import DataFrame, Index +import pandas._testing as tm + + +class TestDataFrameInsert: + def test_insert(self): + df = DataFrame( + np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"] + ) + + df.insert(0, "foo", df["a"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"])) + tm.assert_series_equal(df["a"], df["foo"], check_names=False) + + df.insert(2, "bar", df["c"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"])) + tm.assert_almost_equal(df["c"], df["bar"], check_names=False) + + with pytest.raises(ValueError, match="already exists"): + df.insert(1, "a", df["b"]) + + msg = "cannot insert c, already exists" + with pytest.raises(ValueError, match=msg): + df.insert(1, "c", df["b"]) + + df.columns.name = "some_name" + # preserve columns name field + df.insert(0, "baz", df["c"]) + assert df.columns.name == "some_name" + + def test_insert_column_bug_4032(self): + + # GH#4032, inserting a column and renaming causing errors + df = DataFrame({"b": [1.1, 2.2]}) + + df = df.rename(columns={}) + df.insert(0, "a", [1, 2]) + result = df.rename(columns={}) + + str(result) + expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + df.insert(0, "c", [1.3, 2.3]) + result = df.rename(columns={}) + + str(result) + expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) + tm.assert_frame_equal(result, expected) + + def test_insert_with_columns_dups(self): + # GH#14291 + df = DataFrame() + df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True) + df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) + df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) + exp = DataFrame( + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + ) + tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py new file mode 100644 index 0000000000000..30db6110efc80 --- /dev/null +++ b/pandas/tests/frame/indexing/test_mask.py @@ -0,0 +1,83 @@ +""" +Tests for DataFrame.mask; tests DataFrame.where as a side-effect. 
+""" + +import numpy as np + +from pandas import DataFrame, isna +import pandas._testing as tm + + +class TestDataFrameMask: + def test_mask(self): + df = DataFrame(np.random.randn(5, 3)) + cond = df > 0 + + rs = df.where(cond, np.nan) + tm.assert_frame_equal(rs, df.mask(df <= 0)) + tm.assert_frame_equal(rs, df.mask(~cond)) + + other = DataFrame(np.random.randn(5, 3)) + rs = df.where(cond, other) + tm.assert_frame_equal(rs, df.mask(df <= 0, other)) + tm.assert_frame_equal(rs, df.mask(~cond, other)) + + # see GH#21891 + df = DataFrame([1, 2]) + res = df.mask([[True], [False]]) + + exp = DataFrame([np.nan, 2]) + tm.assert_frame_equal(res, exp) + + def test_mask_inplace(self): + # GH#8801 + df = DataFrame(np.random.randn(5, 3)) + cond = df > 0 + + rdf = df.copy() + + rdf.where(cond, inplace=True) + tm.assert_frame_equal(rdf, df.where(cond)) + tm.assert_frame_equal(rdf, df.mask(~cond)) + + rdf = df.copy() + rdf.where(cond, -df, inplace=True) + tm.assert_frame_equal(rdf, df.where(cond, -df)) + tm.assert_frame_equal(rdf, df.mask(~cond, -df)) + + def test_mask_edge_case_1xN_frame(self): + # GH#4071 + df = DataFrame([[1, 2]]) + res = df.mask(DataFrame([[True, False]])) + expec = DataFrame([[np.nan, 2]]) + tm.assert_frame_equal(res, expec) + + def test_mask_callable(self): + # GH#12533 + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = df.mask(lambda x: x > 4, lambda x: x + 1) + exp = DataFrame([[1, 2, 3], [4, 6, 7], [8, 9, 10]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.mask(df > 4, df + 1)) + + # return ndarray and scalar + result = df.mask(lambda x: (x % 2 == 0).values, lambda x: 99) + exp = DataFrame([[1, 99, 3], [99, 5, 99], [7, 99, 9]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.mask(df % 2 == 0, 99)) + + # chain + result = (df + 2).mask(lambda x: x > 8, lambda x: x + 10) + exp = DataFrame([[3, 4, 5], [6, 7, 8], [19, 20, 21]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, (df + 2).mask((df + 2) > 8, (df + 2) + 10)) + + def test_mask_dtype_conversion(self): + # GH#3733 + df = DataFrame(data=np.random.randn(100, 50)) + df = df.where(df > 0) # create nans + bools = df > 0 + mask = isna(df) + expected = bools.astype(float).mask(mask) + result = bools.mask(mask) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py new file mode 100644 index 0000000000000..d53665539309c --- /dev/null +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +from pandas import Categorical, DataFrame, Index, Series, Timestamp, date_range +import pandas._testing as tm +from pandas.core.arrays import SparseArray + + +class TestDataFrameSetItem: + def test_setitem_error_msmgs(self): + + # GH 7432 + df = DataFrame( + {"bar": [1, 2, 3], "baz": ["d", "e", "f"]}, + index=Index(["a", "b", "c"], name="foo"), + ) + ser = Series( + ["g", "h", "i", "j"], + index=Index(["a", "b", "c", "a"], name="foo"), + name="fiz", + ) + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df["newcol"] = ser + + # GH 4107, more descriptive error message + df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"]) + + msg = "incompatible index of inserted column with frame index" + with pytest.raises(TypeError, match=msg): + df["gr"] = df.groupby(["b", "c"]).count() + + def test_setitem_benchmark(self): + # from the vb_suite/frame_methods/frame_insert_columns 
+ N = 10 + K = 5 + df = DataFrame(index=range(N)) + new_col = np.random.randn(N) + for i in range(K): + df[i] = new_col + expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N)) + tm.assert_frame_equal(df, expected) + + def test_setitem_different_dtype(self): + df = DataFrame( + np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"] + ) + df.insert(0, "foo", df["a"]) + df.insert(2, "bar", df["c"]) + + # diff dtype + + # new item + df["x"] = df["a"].astype("float32") + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 5 + [np.dtype("float32")], + index=["foo", "c", "bar", "b", "a", "x"], + ) + tm.assert_series_equal(result, expected) + + # replacing current (in different block) + df["a"] = df["a"].astype("float32") + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2, + index=["foo", "c", "bar", "b", "a", "x"], + ) + tm.assert_series_equal(result, expected) + + df["y"] = df["a"].astype("int32") + result = df.dtypes + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")], + index=["foo", "c", "bar", "b", "a", "x", "y"], + ) + tm.assert_series_equal(result, expected) + + def test_setitem_empty_columns(self): + # GH 13522 + df = DataFrame(index=["A", "B", "C"]) + df["X"] = df.index + df["X"] = ["x", "y", "z"] + exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) + tm.assert_frame_equal(df, exp) + + def test_setitem_dt64_index_empty_columns(self): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") + df = DataFrame(index=np.arange(len(rng))) + + df["A"] = rng + assert df["A"].dtype == np.dtype("M8[ns]") + + def test_setitem_timestamp_empty_columns(self): + # GH#19843 + df = DataFrame(index=range(3)) + df["now"] = Timestamp("20130101", tz="UTC") + + expected = DataFrame( + [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"], + ) + tm.assert_frame_equal(df, expected) + + def test_setitem_wrong_length_categorical_dtype_raises(self): + # GH#29523 + cat = Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) + df = DataFrame(range(10), columns=["bar"]) + + msg = "Length of values does not match length of index" + with pytest.raises(ValueError, match=msg): + df["foo"] = cat + + def test_setitem_with_sparse_value(self): + # GH#8131 + df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) + sp_array = SparseArray([0, 0, 1]) + df["new_column"] = sp_array + + expected = Series(sp_array, name="new_column") + tm.assert_series_equal(df["new_column"], expected) + + def test_setitem_with_unaligned_sparse_value(self): + df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) + sp_series = Series(SparseArray([0, 0, 1]), index=[2, 1, 0]) + + df["new_column"] = sp_series + expected = Series(SparseArray([1, 0, 0]), name="new_column") + tm.assert_series_equal(df["new_column"], expected) diff --git a/pandas/tests/frame/indexing/test_take.py b/pandas/tests/frame/indexing/test_take.py new file mode 100644 index 0000000000000..3b59d3cf10658 --- /dev/null +++ b/pandas/tests/frame/indexing/test_take.py @@ -0,0 +1,88 @@ +import pytest + +import pandas._testing as tm + + +class TestDataFrameTake: + def test_take(self, float_frame): + # homogeneous + order = [3, 1, 2, 0] + for df in [float_frame]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["D", "B", "C", "A"]] 
+ tm.assert_frame_equal(result, expected, check_names=False) + + # negative indices + order = [2, 1, -1] + for df in [float_frame]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + result = df.take(order, axis=0) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["C", "B", "D"]] + tm.assert_frame_equal(result, expected, check_names=False) + + # illegal indices + msg = "indices are out-of-bounds" + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, 30], axis=0) + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, -31], axis=0) + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, 5], axis=1) + with pytest.raises(IndexError, match=msg): + df.take([3, 1, 2, -5], axis=1) + + def test_take_mixed_type(self, float_string_frame): + + # mixed-dtype + order = [4, 1, 2, 0, 3] + for df in [float_string_frame]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["foo", "B", "C", "A", "D"]] + tm.assert_frame_equal(result, expected) + + # negative indices + order = [4, 1, -2] + for df in [float_string_frame]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["foo", "B", "D"]] + tm.assert_frame_equal(result, expected) + + def test_take_mixed_numeric(self, mixed_float_frame, mixed_int_frame): + # by dtype + order = [1, 2, 0, 3] + for df in [mixed_float_frame, mixed_int_frame]: + + result = df.take(order, axis=0) + expected = df.reindex(df.index.take(order)) + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.take(order, axis=1) + expected = df.loc[:, ["B", "C", "A", "D"]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index eee754a47fb8c..24eb424bd5735 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -397,7 +397,8 @@ def test_where_none(self): def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): # see gh-21947 df = pd.DataFrame(columns=["a"]) - cond = df.applymap(lambda x: x > 0) + cond = df + assert (cond.dtypes == object).all() result = df.where(cond) tm.assert_frame_equal(result, df) @@ -590,3 +591,40 @@ def test_where_tz_values(self, tz_naive_fixture): ) result = df1.where(mask, df2) tm.assert_frame_equal(exp, result) + + def test_df_where_change_dtype(self): + # GH#16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + result = df.where(mask) + expected = DataFrame( + [[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC") + ) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("kwargs", [dict(), dict(other=None)]) + def test_df_where_with_category(self, kwargs): + # GH#16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + # change type to category + df.A = df.A.astype("category") + df.B = df.B.astype("category") + df.C = df.C.astype("category") + + result = df.where(mask, **kwargs) + A = pd.Categorical([0, np.nan], categories=[0, 3]) + B = 
pd.Categorical([np.nan, np.nan], categories=[1, 4]) + C = pd.Categorical([np.nan, 5], categories=[2, 5]) + expected = DataFrame({"A": A, "B": B, "C": C}) + + tm.assert_frame_equal(result, expected) + + # Check Series.where while we're here + result = df.A.where(mask[:, 0], **kwargs) + expected = Series(A, name="A") + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py new file mode 100644 index 0000000000000..5dae719283d17 --- /dev/null +++ b/pandas/tests/frame/methods/test_align.py @@ -0,0 +1,245 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series +import pandas._testing as tm + + +class TestDataFrameAlign: + def test_align_float(self, float_frame): + af, bf = float_frame.align(float_frame) + assert af._mgr is not float_frame._mgr + + af, bf = float_frame.align(float_frame, copy=False) + assert af._mgr is float_frame._mgr + + # axis = 0 + other = float_frame.iloc[:-5, :3] + af, bf = float_frame.align(other, axis=0, fill_value=-1) + + tm.assert_index_equal(bf.columns, other.columns) + + # test fill value + join_idx = float_frame.index.join(other.index) + diff_a = float_frame.index.difference(join_idx) + diff_b = other.index.difference(join_idx) + diff_a_vals = af.reindex(diff_a).values + diff_b_vals = bf.reindex(diff_b).values + assert (diff_a_vals == -1).all() + + af, bf = float_frame.align(other, join="right", axis=0) + tm.assert_index_equal(bf.columns, other.columns) + tm.assert_index_equal(bf.index, other.index) + tm.assert_index_equal(af.index, other.index) + + # axis = 1 + other = float_frame.iloc[:-5, :3].copy() + af, bf = float_frame.align(other, axis=1) + tm.assert_index_equal(bf.columns, float_frame.columns) + tm.assert_index_equal(bf.index, other.index) + + # test fill value + join_idx = float_frame.index.join(other.index) + diff_a = float_frame.index.difference(join_idx) + diff_b = other.index.difference(join_idx) + diff_a_vals = af.reindex(diff_a).values + + # TODO(wesm): unused? 
+ diff_b_vals = bf.reindex(diff_b).values # noqa + + assert (diff_a_vals == -1).all() + + af, bf = float_frame.align(other, join="inner", axis=1) + tm.assert_index_equal(bf.columns, other.columns) + + af, bf = float_frame.align(other, join="inner", axis=1, method="pad") + tm.assert_index_equal(bf.columns, other.columns) + + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None + ) + tm.assert_index_equal(bf.index, Index([])) + + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) + tm.assert_index_equal(bf.index, Index([])) + + # Try to align DataFrame to Series along bad axis + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): + float_frame.align(af.iloc[0, :3], join="inner", axis=2) + + # align dataframe to series with broadcast or not + idx = float_frame.index + s = Series(range(len(idx)), index=idx) + + left, right = float_frame.align(s, axis=0) + tm.assert_index_equal(left.index, float_frame.index) + tm.assert_index_equal(right.index, float_frame.index) + assert isinstance(right, Series) + + left, right = float_frame.align(s, broadcast_axis=1) + tm.assert_index_equal(left.index, float_frame.index) + expected = {c: s for c in float_frame.columns} + expected = DataFrame( + expected, index=float_frame.index, columns=float_frame.columns + ) + tm.assert_frame_equal(right, expected) + + # see gh-9558 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df[df["a"] == 2] + expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + result = df.where(df["a"] == 2, 0) + expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) + tm.assert_frame_equal(result, expected) + + def test_align_int(self, int_frame): + # test other non-float types + other = DataFrame(index=range(5), columns=["A", "B", "C"]) + + af, bf = int_frame.align(other, join="inner", axis=1, method="pad") + tm.assert_index_equal(bf.columns, other.columns) + + def test_align_mixed_type(self, float_string_frame): + + af, bf = float_string_frame.align( + float_string_frame, join="inner", axis=1, method="pad" + ) + tm.assert_index_equal(bf.columns, float_string_frame.columns) + + def test_align_mixed_float(self, mixed_float_frame): + # mixed floats/ints + other = DataFrame(index=range(5), columns=["A", "B", "C"]) + + af, bf = mixed_float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) + tm.assert_index_equal(bf.index, Index([])) + + def test_align_mixed_int(self, mixed_int_frame): + other = DataFrame(index=range(5), columns=["A", "B", "C"]) + + af, bf = mixed_int_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) + tm.assert_index_equal(bf.index, Index([])) + + def test_align_multiindex(self): + # GH#10665 + # same test cases as test_align_multiindex in test_series.py + + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + df1 = pd.DataFrame(np.arange(12, dtype="int64"), index=midx) + df2 = pd.DataFrame(np.arange(2, dtype="int64"), index=idx) + + # these must be the same results (but flipped) + res1l, res1r = df1.align(df2, join="left") + res2l, res2r = df2.align(df1, join="right") + + expl = df1 + tm.assert_frame_equal(expl, res1l) + tm.assert_frame_equal(expl, res2r) + expr = pd.DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_frame_equal(expr, res1r) + 
tm.assert_frame_equal(expr, res2l) + + res1l, res1r = df1.align(df2, join="right") + res2l, res2r = df2.align(df1, join="left") + + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) + expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + tm.assert_frame_equal(expl, res1l) + tm.assert_frame_equal(expl, res2r) + expr = pd.DataFrame([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_frame_equal(expr, res1r) + tm.assert_frame_equal(expr, res2l) + + def test_align_series_combinations(self): + df = pd.DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) + s = pd.Series([1, 2, 4], index=list("ABD"), name="x") + + # frame + series + res1, res2 = df.align(s, axis=0) + exp1 = pd.DataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) + exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") + + tm.assert_frame_equal(res1, exp1) + tm.assert_series_equal(res2, exp2) + + # series + frame + res1, res2 = s.align(df) + tm.assert_series_equal(res1, exp2) + tm.assert_frame_equal(res2, exp1) + + def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): + aa, ab = a.align( + b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis + ) + + join_index, join_columns = None, None + + ea, eb = a, b + if axis is None or axis == 0: + join_index = a.index.join(b.index, how=how) + ea = ea.reindex(index=join_index) + eb = eb.reindex(index=join_index) + + if axis is None or axis == 1: + join_columns = a.columns.join(b.columns, how=how) + ea = ea.reindex(columns=join_columns) + eb = eb.reindex(columns=join_columns) + + ea = ea.fillna(axis=fill_axis, method=method, limit=limit) + eb = eb.fillna(axis=fill_axis, method=method, limit=limit) + + tm.assert_frame_equal(aa, ea) + tm.assert_frame_equal(ab, eb) + + @pytest.mark.parametrize("meth", ["pad", "bfill"]) + @pytest.mark.parametrize("ax", [0, 1, None]) + @pytest.mark.parametrize("fax", [0, 1]) + @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) + def test_align_fill_method(self, how, meth, ax, fax, float_frame): + df = float_frame + self._check_align_fill(df, how, meth, ax, fax) + + def _check_align_fill(self, frame, kind, meth, ax, fax): + left = frame.iloc[0:4, :10] + right = frame.iloc[2:, 6:] + empty = frame.iloc[:0, :0] + + self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) + + # empty left + self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) + + # empty right + self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) + + # both empty + self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index e2b417972638e..70b42976c95a7 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -1,7 +1,17 @@ import numpy as np import pytest -from pandas import DataFrame, Period, Series, Timestamp, date_range, to_datetime +from pandas._libs.tslibs import IncompatibleFrequency + +from pandas import 
( + DataFrame, + Period, + Series, + Timestamp, + date_range, + period_range, + to_datetime, +) import pandas._testing as tm @@ -21,7 +31,7 @@ class TestFrameAsof: def test_basic(self, date_range_frame): df = date_range_frame N = 50 - df.loc[15:30, "A"] = np.nan + df.loc[df.index[15:30], "A"] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = df.asof(dates) @@ -41,7 +51,7 @@ def test_basic(self, date_range_frame): def test_subset(self, date_range_frame): N = 10 df = date_range_frame.iloc[:N].copy() - df.loc[4:8, "A"] = np.nan + df.loc[df.index[4:8], "A"] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") # with a subset of A should be the same @@ -149,10 +159,20 @@ def test_is_copy(self, date_range_frame): # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings df = date_range_frame N = 50 - df.loc[15:30, "A"] = np.nan + df.loc[df.index[15:30], "A"] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = df.asof(dates) with tm.assert_produces_warning(None): result["C"] = 1 + + def test_asof_periodindex_mismatched_freq(self): + N = 50 + rng = period_range("1/1/1990", periods=N, freq="H") + df = DataFrame(np.random.randn(N), index=rng) + + # Mismatched freq + msg = "Input has different freq" + with pytest.raises(IncompatibleFrequency, match=msg): + df.asof(rng.asfreq("D")) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 8a75e80a12f52..b61d0d28e2fba 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -230,15 +230,15 @@ def test_describe_timedelta_values(self): tm.assert_frame_equal(result, expected) exp_repr = ( - " t1 t2\n" - "count 5 5\n" - "mean 3 days 00:00:00 0 days 03:00:00\n" - "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" - "min 1 days 00:00:00 0 days 01:00:00\n" - "25% 2 days 00:00:00 0 days 02:00:00\n" - "50% 3 days 00:00:00 0 days 03:00:00\n" - "75% 4 days 00:00:00 0 days 04:00:00\n" - "max 5 days 00:00:00 0 days 05:00:00" + " t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919273 0 days 01:34:52.099788303\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00" ) assert repr(result) == exp_repr diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index ffdb6d41ebda5..6a9248e1cba1e 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -64,18 +64,15 @@ def test_diff_datetime_axis1(self, tz): 1: date_range("2010", freq="D", periods=2, tz=tz), } ) - if tz is None: - result = df.diff(axis=1) - expected = DataFrame( - { - 0: pd.TimedeltaIndex(["NaT", "NaT"]), - 1: pd.TimedeltaIndex(["0 days", "0 days"]), - } - ) - tm.assert_frame_equal(result, expected) - else: - with pytest.raises(NotImplementedError): - result = df.diff(axis=1) + + result = df.diff(axis=1) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "NaT"]), + 1: pd.TimedeltaIndex(["0 days", "0 days"]), + } + ) + tm.assert_frame_equal(result, expected) def test_diff_timedelta(self): # GH#4533 @@ -118,3 +115,46 @@ def test_diff_axis(self): tm.assert_frame_equal( df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]) ) + + @pytest.mark.xfail( + reason="GH#32995 needs to operate column-wise or do inference", + 
raises=AssertionError, + ) + def test_diff_period(self): + # GH#32995 Don't pass an incorrect axis + # TODO(EA2D): this bug wouldn't have happened with 2D EA + pi = pd.date_range("2016-01-01", periods=3).to_period("D") + df = pd.DataFrame({"A": pi}) + + result = df.diff(1, axis=1) + + # TODO: should we make Block.diff do type inference? or maybe algos.diff? + expected = (df - pd.NaT).astype(object) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = pd.DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2}) + + result = df.diff(axis=1) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes_large_periods(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = df * np.nan + + result = df.diff(axis=1, periods=3) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes_negative_periods(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = pd.DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan}) + + result = df.diff(axis=1, periods=-1) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py new file mode 100644 index 0000000000000..177d10cdbf615 --- /dev/null +++ b/pandas/tests/frame/methods/test_drop.py @@ -0,0 +1,419 @@ +import re + +import numpy as np +import pytest + +from pandas.errors import PerformanceWarning + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp +import pandas._testing as tm + + +@pytest.mark.parametrize( + "msg,labels,level", + [ + (r"labels \[4\] not found in level", 4, "a"), + (r"labels \[7\] not found in level", 7, "b"), + ], +) +def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): + # GH 8594 + mi = pd.MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + with pytest.raises(KeyError, match=msg): + s.drop(labels, level=level) + with pytest.raises(KeyError, match=msg): + df.drop(labels, level=level) + + +@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")]) +def test_drop_errors_ignore(labels, level): + # GH 8594 + mi = pd.MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + expected_s = s.drop(labels, level=level, errors="ignore") + tm.assert_series_equal(s, expected_s) + + expected_df = df.drop(labels, level=level, errors="ignore") + tm.assert_frame_equal(df, expected_df) + + +def test_drop_with_non_unique_datetime_index_and_invalid_keys(): + # GH 30399 + + # define dataframe with unique datetime index + df = pd.DataFrame( + np.random.randn(5, 3), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="H", periods=5), + ) + # create dataframe with non-unique datetime index + df = df.iloc[[0, 2, 2, 3]].copy() + + with pytest.raises(KeyError, match="not found in axis"): + df.drop(["a", "b"]) # Dropping with labels not exist in the index + + +class TestDataFrameDrop: + def test_drop_names(self): + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + 
index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + df.index.name, df.columns.name = "first", "second" + df_dropped_b = df.drop("b") + df_dropped_e = df.drop("e", axis=1) + df_inplace_b, df_inplace_e = df.copy(), df.copy() + df_inplace_b.drop("b", inplace=True) + df_inplace_e.drop("e", axis=1, inplace=True) + for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): + assert obj.index.name == "first" + assert obj.columns.name == "second" + assert list(df.columns) == ["d", "e", "f"] + + msg = r"\['g'\] not found in axis" + with pytest.raises(KeyError, match=msg): + df.drop(["g"]) + with pytest.raises(KeyError, match=msg): + df.drop(["g"], 1) + + # errors = 'ignore' + dropped = df.drop(["g"], errors="ignore") + expected = Index(["a", "b", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + dropped = df.drop(["b", "g"], errors="ignore") + expected = Index(["a", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + dropped = df.drop(["g"], axis=1, errors="ignore") + expected = Index(["d", "e", "f"], name="second") + tm.assert_index_equal(dropped.columns, expected) + + dropped = df.drop(["d", "g"], axis=1, errors="ignore") + expected = Index(["e", "f"], name="second") + tm.assert_index_equal(dropped.columns, expected) + + # GH 16398 + dropped = df.drop([], errors="ignore") + expected = Index(["a", "b", "c"], name="first") + tm.assert_index_equal(dropped.index, expected) + + def test_drop(self): + simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) + tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]]) + tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]]) + tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) + tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) + + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop(5) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop("C", 1) + with pytest.raises(KeyError, match=r"\[5\] not found in axis"): + simple.drop([1, 5]) + with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): + simple.drop(["A", "C"], 1) + + # errors = 'ignore' + tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple) + tm.assert_frame_equal( + simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :] + ) + tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple) + tm.assert_frame_equal( + simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]] + ) + + # non-unique - wheee! 
+ nu_df = DataFrame( + list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"] + ) + tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]]) + tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"]) + tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 + + nu_df = nu_df.set_index(pd.Index(["X", "Y", "X"])) + nu_df.columns = list("abc") + tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) + tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) + + # inplace cache issue + # GH#5628 + df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) + expected = df[~(df.b > 0)] + df.drop(labels=df[df.b > 0].index, inplace=True) + tm.assert_frame_equal(df, expected) + + def test_drop_multiindex_not_lexsorted(self): + # GH#11640 + + # define the lexsorted version + lexsorted_mi = MultiIndex.from_tuples( + [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] + ) + lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) + assert lexsorted_df.columns.is_lexsorted() + + # define the non-lexsorted version + not_lexsorted_df = DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) + not_lexsorted_df = not_lexsorted_df.pivot_table( + index="a", columns=["b", "c"], values="d" + ) + not_lexsorted_df = not_lexsorted_df.reset_index() + assert not not_lexsorted_df.columns.is_lexsorted() + + # compare the results + tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) + + expected = lexsorted_df.drop("a", axis=1) + with tm.assert_produces_warning(PerformanceWarning): + result = not_lexsorted_df.drop("a", axis=1) + + tm.assert_frame_equal(result, expected) + + def test_drop_api_equivalence(self): + # equivalence of the labels/axis and index/columns API's (GH#12392) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.drop("a") + res2 = df.drop(index="a") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop("d", 1) + res2 = df.drop(columns="d") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(labels="e", axis=1) + res2 = df.drop(columns="e") + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(["a"], axis=0) + res2 = df.drop(index=["a"]) + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(["a"], axis=0).drop(["d"], axis=1) + res2 = df.drop(index=["a"], columns=["d"]) + tm.assert_frame_equal(res1, res2) + + msg = "Cannot specify both 'labels' and 'index'/'columns'" + with pytest.raises(ValueError, match=msg): + df.drop(labels="a", index="b") + + with pytest.raises(ValueError, match=msg): + df.drop(labels="a", columns="b") + + msg = "Need to specify at least one of 'labels', 'index' or 'columns'" + with pytest.raises(ValueError, match=msg): + df.drop(axis=1) + + data = [[1, 2, 3], [1, 2, 3]] + + @pytest.mark.parametrize( + "actual", + [ + DataFrame(data=data, index=["a", "a"]), + DataFrame(data=data, index=["a", "b"]), + DataFrame(data=data, index=["a", "b"]).set_index([0, 1]), + DataFrame(data=data, index=["a", "a"]).set_index([0, 1]), + ], + ) + def test_raise_on_drop_duplicate_index(self, actual): + + # GH#19186 + level = 0 if isinstance(actual.index, MultiIndex) else None + msg = re.escape("\"['c'] not found in axis\"") + with pytest.raises(KeyError, match=msg): + actual.drop("c", level=level, axis=0) + with pytest.raises(KeyError, match=msg): + actual.T.drop("c", level=level, axis=1) + expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore") + tm.assert_frame_equal(expected_no_err, actual) 
+ expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore") + tm.assert_frame_equal(expected_no_err.T, actual) + + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]]) + @pytest.mark.parametrize("drop_labels", [[], [1], [2]]) + def test_drop_empty_list(self, index, drop_labels): + # GH#21494 + expected_index = [i for i in index if i not in drop_labels] + frame = pd.DataFrame(index=index).drop(drop_labels) + tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) + + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]]) + @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]]) + def test_drop_non_empty_list(self, index, drop_labels): + # GH# 21494 + with pytest.raises(KeyError, match="not found in axis"): + pd.DataFrame(index=index).drop(drop_labels) + + def test_mixed_depth_drop(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.randn(4, 6), columns=index) + + result = df.drop("a", axis=1) + expected = df.drop([("a", "", "")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(["top"], axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + expected = expected.drop([("top", "OD", "wy")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(("top", "OD", "wx"), axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + tm.assert_frame_equal(expected, result) + + expected = df.drop([("top", "OD", "wy")], axis=1) + expected = df.drop("top", axis=1) + + result = df.drop("result1", level=1, axis=1) + expected = df.drop( + [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 + ) + tm.assert_frame_equal(expected, result) + + def test_drop_multiindex_other_level_nan(self): + # GH#12754 + df = ( + DataFrame( + { + "A": ["one", "one", "two", "two"], + "B": [np.nan, 0.0, 1.0, 2.0], + "C": ["a", "b", "c", "c"], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.drop("c", level="C") + expected = DataFrame( + [2, 1], + columns=["D"], + index=pd.MultiIndex.from_tuples( + [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_drop_nonunique(self): + df = DataFrame( + [ + ["x-a", "x", "a", 1.5], + ["x-a", "x", "a", 1.2], + ["z-c", "z", "c", 3.1], + ["x-a", "x", "a", 4.1], + ["x-b", "x", "b", 5.1], + ["x-b", "x", "b", 4.1], + ["x-b", "x", "b", 2.2], + ["y-a", "y", "a", 1.2], + ["z-b", "z", "b", 2.1], + ], + columns=["var1", "var2", "var3", "var4"], + ) + + grp_size = df.groupby("var1").size() + drop_idx = grp_size.loc[grp_size == 1] + + idf = df.set_index(["var1", "var2", "var3"]) + + # it works! 
GH#2101 + result = idf.drop(drop_idx.index, level=0).reset_index() + expected = df[-df.var1.isin(drop_idx.index)] + + result.index = expected.index + + tm.assert_frame_equal(result, expected) + + def test_drop_level(self): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + frame = DataFrame( + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + ) + + result = frame.drop(["bar", "qux"], level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = frame.drop(["two"], level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]] + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["bar", "qux"], axis=1, level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]].T + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["two"], axis=1, level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T + tm.assert_frame_equal(result, expected) + + def test_drop_level_nonunique_datetime(self): + # GH#12701 + idx = Index([2, 3, 4, 4, 5], name="id") + idxdt = pd.to_datetime( + [ + "201603231400", + "201603231500", + "201603231600", + "201603231600", + "201603231700", + ] + ) + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) + df["tstamp"] = idxdt + df = df.set_index("tstamp", append=True) + ts = Timestamp("201603231600") + assert df.index.is_unique is False + + result = df.drop(ts, level="tstamp") + expected = df.loc[idx != 4] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("box", [Series, DataFrame]) + def test_drop_tz_aware_timestamp_across_dst(self, box): + # GH#21761 + start = Timestamp("2017-10-29", tz="Europe/Berlin") + end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") + index = pd.date_range(start, end, freq="15min") + data = box(data=[1] * len(index), index=index) + result = data.drop(start) + expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") + expected_idx = pd.date_range(expected_start, end, freq="15min") + expected = box(data=[1] * len(expected_idx), index=expected_idx) + tm.assert_equal(result, expected) + + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] + ) + + df = DataFrame(np.random.randn(6, 3), index=index) + + result = df.drop([(0, 2)]) + assert result.index.names == ("one", "two") diff --git a/pandas/tests/frame/methods/test_droplevel.py b/pandas/tests/frame/methods/test_droplevel.py new file mode 100644 index 0000000000000..517905cf23259 --- /dev/null +++ b/pandas/tests/frame/methods/test_droplevel.py @@ -0,0 +1,23 @@ +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + + +class TestDropLevel: + def test_droplevel(self): + # GH#20342 + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) + df = df.set_index([0, 1]).rename_axis(["a", "b"]) + df.columns = MultiIndex.from_tuples( + [("c", "e"), ("d", "f")], names=["level_1", "level_2"] + ) + + # test that dropping of a level in index works + expected = df.reset_index("a", drop=True) + result = df.droplevel("a", axis="index") + tm.assert_frame_equal(result, expected) + + # test that dropping of a level in columns works + expected = df.copy() + expected.columns = Index(["c", "d"], name="level_1") + result = df.droplevel("level_2", axis="columns") + tm.assert_frame_equal(result, expected) 
diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py index 38b9d7fd049ab..82fd6d88b82b9 100644 --- a/pandas/tests/frame/methods/test_duplicated.py +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import DataFrame, Series +from pandas import DataFrame, Series, date_range import pandas._testing as tm @@ -64,7 +64,6 @@ def test_duplicated_nan_none(keep, expected): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("keep", ["first", "last", False]) @pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) def test_duplicated_subset(subset, keep): df = DataFrame( @@ -96,3 +95,15 @@ def test_duplicated_on_empty_frame(): result = df[dupes] expected = df.copy() tm.assert_frame_equal(result, expected) + + +def test_frame_datetime64_duplicated(): + dates = date_range("2010-07-01", end="2010-08-05") + + tst = DataFrame({"symbol": "AAA", "date": dates}) + result = tst.duplicated(["date", "symbol"]) + assert (-result).all() + + tst = DataFrame({"date": dates}) + result = tst.duplicated() + assert (-result).all() diff --git a/pandas/tests/frame/methods/test_filter.py b/pandas/tests/frame/methods/test_filter.py new file mode 100644 index 0000000000000..569b2fe21d1c2 --- /dev/null +++ b/pandas/tests/frame/methods/test_filter.py @@ -0,0 +1,139 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +class TestDataFrameFilter: + def test_filter(self, float_frame, float_string_frame): + # Items + filtered = float_frame.filter(["A", "B", "E"]) + assert len(filtered.columns) == 2 + assert "E" not in filtered + + filtered = float_frame.filter(["A", "B", "E"], axis="columns") + assert len(filtered.columns) == 2 + assert "E" not in filtered + + # Other axis + idx = float_frame.index[0:4] + filtered = float_frame.filter(idx, axis="index") + expected = float_frame.reindex(index=idx) + tm.assert_frame_equal(filtered, expected) + + # like + fcopy = float_frame.copy() + fcopy["AA"] = 1 + + filtered = fcopy.filter(like="A") + assert len(filtered.columns) == 2 + assert "AA" in filtered + + # like with ints in column names + df = DataFrame(0.0, index=[0, 1, 2], columns=[0, 1, "_A", "_B"]) + filtered = df.filter(like="_") + assert len(filtered.columns) == 2 + + # regex with ints in column names + # from PR #10384 + df = DataFrame(0.0, index=[0, 1, 2], columns=["A1", 1, "B", 2, "C"]) + expected = DataFrame( + 0.0, index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object) + ) + filtered = df.filter(regex="^[0-9]+$") + tm.assert_frame_equal(filtered, expected) + + expected = DataFrame(0.0, index=[0, 1, 2], columns=[0, "0", 1, "1"]) + # shouldn't remove anything + filtered = expected.filter(regex="^[0-9]+$") + tm.assert_frame_equal(filtered, expected) + + # pass in None + with pytest.raises(TypeError, match="Must pass"): + float_frame.filter() + with pytest.raises(TypeError, match="Must pass"): + float_frame.filter(items=None) + with pytest.raises(TypeError, match="Must pass"): + float_frame.filter(axis=1) + + # test mutually exclusive arguments + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$", like="bbi") + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$", axis=1) + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$") + with 
pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], like="bbi", axis=0) + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], like="bbi") + + # objects + filtered = float_string_frame.filter(like="foo") + assert "foo" in filtered + + # unicode columns, won't ascii-encode + df = float_frame.rename(columns={"B": "\u2202"}) + filtered = df.filter(like="C") + assert "C" in filtered + + def test_filter_regex_search(self, float_frame): + fcopy = float_frame.copy() + fcopy["AA"] = 1 + + # regex + filtered = fcopy.filter(regex="[A]+") + assert len(filtered.columns) == 2 + assert "AA" in filtered + + # doesn't have to be at beginning + df = DataFrame( + {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]} + ) + + result = df.filter(regex="BB") + exp = df[[x for x in df.columns if "BB" in x]] + tm.assert_frame_equal(result, exp) + + @pytest.mark.parametrize( + "name,expected", + [ + ("a", DataFrame({"a": [1, 2]})), + ("a", DataFrame({"a": [1, 2]})), + ("あ", DataFrame({"あ": [3, 4]})), + ], + ) + def test_filter_unicode(self, name, expected): + # GH13101 + df = DataFrame({"a": [1, 2], "あ": [3, 4]}) + + tm.assert_frame_equal(df.filter(like=name), expected) + tm.assert_frame_equal(df.filter(regex=name), expected) + + @pytest.mark.parametrize("name", ["a", "a"]) + def test_filter_bytestring(self, name): + # GH13101 + df = DataFrame({b"a": [1, 2], b"b": [3, 4]}) + expected = DataFrame({b"a": [1, 2]}) + + tm.assert_frame_equal(df.filter(like=name), expected) + tm.assert_frame_equal(df.filter(regex=name), expected) + + def test_filter_corner(self): + empty = DataFrame() + + result = empty.filter([]) + tm.assert_frame_equal(result, empty) + + result = empty.filter(like="foo") + tm.assert_frame_equal(result, empty) + + def test_filter_regex_non_string(self): + # GH#5798 trying to filter on non-string columns should drop, + # not raise + df = pd.DataFrame(np.random.random((3, 2)), columns=["STRING", 123]) + result = df.filter(regex="STRING") + expected = df[["STRING"]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_first_and_last.py b/pandas/tests/frame/methods/test_first_and_last.py new file mode 100644 index 0000000000000..73e4128ddebb9 --- /dev/null +++ b/pandas/tests/frame/methods/test_first_and_last.py @@ -0,0 +1,61 @@ +""" +Note: includes tests for `last` +""" +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestFirst: + def test_first_subset(self): + ts = tm.makeTimeDataFrame(freq="12h") + result = ts.first("10d") + assert len(result) == 20 + + ts = tm.makeTimeDataFrame(freq="D") + result = ts.first("10d") + assert len(result) == 10 + + result = ts.first("3M") + expected = ts[:"3/31/2000"] + tm.assert_frame_equal(result, expected) + + result = ts.first("21D") + expected = ts[:21] + tm.assert_frame_equal(result, expected) + + result = ts[:0].first("3M") + tm.assert_frame_equal(result, ts[:0]) + + def test_first_raises(self): + # GH#20725 + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError): # index is not a DatetimeIndex + df.first("1D") + + def test_last_subset(self): + ts = tm.makeTimeDataFrame(freq="12h") + result = ts.last("10d") + assert len(result) == 20 + + ts = tm.makeTimeDataFrame(nper=30, freq="D") + result = ts.last("10d") + assert len(result) == 10 + + result = ts.last("21D") + expected = ts["2000-01-10":] + tm.assert_frame_equal(result, expected) + + result = ts.last("21D") + 
expected = ts[-21:] + tm.assert_frame_equal(result, expected) + + result = ts[:0].last("3M") + tm.assert_frame_equal(result, ts[:0]) + + def test_last_raises(self): + # GH20725 + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(TypeError): # index is not a DatetimeIndex + df.last("1D") diff --git a/pandas/tests/frame/methods/test_pop.py b/pandas/tests/frame/methods/test_pop.py new file mode 100644 index 0000000000000..fccb3f10dde45 --- /dev/null +++ b/pandas/tests/frame/methods/test_pop.py @@ -0,0 +1,40 @@ +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFramePop: + def test_pop(self, float_frame): + float_frame.columns.name = "baz" + + float_frame.pop("A") + assert "A" not in float_frame + + float_frame["foo"] = "bar" + float_frame.pop("foo") + assert "foo" not in float_frame + assert float_frame.columns.name == "baz" + + # gh-10912: inplace ops cause caching issue + a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) + b = a.pop("B") + b += 1 + + # original frame + expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) + tm.assert_frame_equal(a, expected) + + # result + expected = Series([2, 5], index=["X", "Y"], name="B") + 1 + tm.assert_series_equal(b, expected) + + def test_pop_non_unique_cols(self): + df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]}) + df.columns = ["a", "b", "a"] + + res = df.pop("a") + assert type(res) == DataFrame + assert len(res) == 2 + assert len(df.columns) == 1 + assert "b" in df.columns + assert "a" not in df.columns + assert len(df.index) == 2 diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 9c52e8ec5620f..0eec30cbc5c67 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -100,13 +100,10 @@ def test_quantile_axis_parameter(self): result = df.quantile(0.5, axis="columns") tm.assert_series_equal(result, expected) - msg = "No axis named -1 for object type " + msg = "No axis named -1 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis=-1) - msg = ( - "No axis named column for object type " - "" - ) + msg = "No axis named column for object type DataFrame" with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis="column") diff --git a/pandas/tests/frame/methods/test_reindex_like.py b/pandas/tests/frame/methods/test_reindex_like.py new file mode 100644 index 0000000000000..ce68ec28eec3d --- /dev/null +++ b/pandas/tests/frame/methods/test_reindex_like.py @@ -0,0 +1,39 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +class TestDataFrameReindexLike: + def test_reindex_like(self, float_frame): + other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"]) + + tm.assert_frame_equal(other, float_frame.reindex_like(other)) + + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) + def test_reindex_like_methods(self, method, expected_values): + df = DataFrame({"x": list(range(5))}) + + result = df.reindex_like(df, method=method, tolerance=0) + tm.assert_frame_equal(df, result) + result = df.reindex_like(df, method=method, tolerance=[0, 0, 0, 0]) + tm.assert_frame_equal(df, result) + + def test_reindex_like_subclass(self): + # https://github.com/pandas-dev/pandas/issues/31925 + class MyDataFrame(DataFrame): + pass + + 
expected = DataFrame() + df = MyDataFrame() + result = df.reindex_like(expected) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index e69a562f8214d..ffad526d3f4d1 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -67,30 +67,6 @@ def test_rename_chainmap(self, args, kwargs): expected = DataFrame({"a": colAData, "b": colBdata}) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "kwargs, rename_index, rename_columns", - [ - ({"mapper": None, "axis": 0}, True, False), - ({"mapper": None, "axis": 1}, False, True), - ({"index": None}, True, False), - ({"columns": None}, False, True), - ({"index": None, "columns": None}, True, True), - ({}, False, False), - ], - ) - def test_rename_axis_none(self, kwargs, rename_index, rename_columns): - # GH 25034 - index = Index(list("abc"), name="foo") - columns = Index(["col1", "col2"], name="bar") - data = np.arange(6).reshape(3, 2) - df = DataFrame(data, index, columns) - - result = df.rename_axis(**kwargs) - expected_index = index.rename(None) if rename_index else index - expected_columns = columns.rename(None) if rename_columns else columns - expected = DataFrame(data, expected_index, expected_columns) - tm.assert_frame_equal(result, expected) - def test_rename_multiindex(self): tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] diff --git a/pandas/tests/frame/methods/test_rename_axis.py b/pandas/tests/frame/methods/test_rename_axis.py new file mode 100644 index 0000000000000..9b964d842526c --- /dev/null +++ b/pandas/tests/frame/methods/test_rename_axis.py @@ -0,0 +1,105 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm + + +class TestDataFrameRenameAxis: + def test_rename_axis_inplace(self, float_frame): + # GH#15704 + expected = float_frame.rename_axis("foo") + result = float_frame.copy() + no_return = result.rename_axis("foo", inplace=True) + + assert no_return is None + tm.assert_frame_equal(result, expected) + + expected = float_frame.rename_axis("bar", axis=1) + result = float_frame.copy() + no_return = result.rename_axis("bar", axis=1, inplace=True) + + assert no_return is None + tm.assert_frame_equal(result, expected) + + def test_rename_axis_raises(self): + # GH#17833 + df = DataFrame({"A": [1, 2], "B": [1, 2]}) + with pytest.raises(ValueError, match="Use `.rename`"): + df.rename_axis(id, axis=0) + + with pytest.raises(ValueError, match="Use `.rename`"): + df.rename_axis({0: 10, 1: 20}, axis=0) + + with pytest.raises(ValueError, match="Use `.rename`"): + df.rename_axis(id, axis=1) + + with pytest.raises(ValueError, match="Use `.rename`"): + df["A"].rename_axis(id) + + def test_rename_axis_mapper(self): + # GH#19978 + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) + df = DataFrame( + {"x": list(range(len(mi))), "y": [i * 10 for i in range(len(mi))]}, index=mi + ) + + # Test for rename of the Index object of columns + result = df.rename_axis("cols", axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="cols")) + + # Test for rename of the Index object of columns using dict + result = result.rename_axis(columns={"cols": "new"}, axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="new")) + + # Test for renaming index using dict + result = df.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] + + # Test for renaming index 
using a function + result = df.rename_axis(index=str.upper, axis=0) + assert result.index.names == ["LL", "NN"] + + # Test for renaming index providing complete list + result = df.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] + + # Test for changing index and columns at same time + sdf = df.reset_index().set_index("nn").drop(columns=["ll", "y"]) + result = sdf.rename_axis(index="foo", columns="meh") + assert result.index.name == "foo" + assert result.columns.name == "meh" + + # Test different error cases + with pytest.raises(TypeError, match="Must pass"): + df.rename_axis(index="wrong") + + with pytest.raises(ValueError, match="Length of names"): + df.rename_axis(index=["wrong"]) + + with pytest.raises(TypeError, match="bogus"): + df.rename_axis(bogus=None) + + @pytest.mark.parametrize( + "kwargs, rename_index, rename_columns", + [ + ({"mapper": None, "axis": 0}, True, False), + ({"mapper": None, "axis": 1}, False, True), + ({"index": None}, True, False), + ({"columns": None}, False, True), + ({"index": None, "columns": None}, True, True), + ({}, False, False), + ], + ) + def test_rename_axis_none(self, kwargs, rename_index, rename_columns): + # GH 25034 + index = Index(list("abc"), name="foo") + columns = Index(["col1", "col2"], name="bar") + data = np.arange(6).reshape(3, 2) + df = DataFrame(data, index, columns) + + result = df.rename_axis(**kwargs) + expected_index = index.rename(None) if rename_index else index + expected_columns = columns.rename(None) if rename_columns else columns + expected = DataFrame(data, expected_index, expected_columns) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 92b74c4409d7d..a9fb686d5bc50 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1303,9 +1303,15 @@ def test_replace_method(self, to_replace, method, expected): def test_categorical_replace_with_dict(self, replace_dict, final_data): # GH 26988 df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") - expected = DataFrame(final_data, columns=["a", "b"], dtype="category") - expected["a"] = expected["a"].cat.set_categories([1, 2, 3]) - expected["b"] = expected["b"].cat.set_categories([1, 2, 3]) + + final_data = np.array(final_data) + + a = pd.Categorical(final_data[:, 0], categories=[3, 2]) + + excat = [3, 2] if replace_dict["b"] == 1 else [1, 3] + b = pd.Categorical(final_data[:, 1], categories=excat) + + expected = DataFrame({"a": a, "b": b}) result = df.replace(replace_dict, 3) tm.assert_frame_equal(result, expected) with pytest.raises(AssertionError): @@ -1363,3 +1369,14 @@ def test_replace_after_convert_dtypes(self): result = df.replace(1, 10) expected = pd.DataFrame({"grp": [10, 2, 3, 4, 5]}, dtype="Int64") tm.assert_frame_equal(result, expected) + + def test_replace_invalid_to_replace(self): + # GH 18634 + # API: replace() should raise an exception if invalid argument is given + df = pd.DataFrame({"one": ["a", "b ", "c"], "two": ["d ", "e ", "f "]}) + msg = ( + r"Expecting 'to_replace' to be either a scalar, array-like, " + r"dict or None, got invalid type.*" + ) + with pytest.raises(TypeError, match=msg): + df.replace(lambda x: x.strip()) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index cfb17de892b1c..f6c89172bbf86 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -185,3 +185,26 @@ 
def test_tshift(self, datetime_frame): msg = "Freq was not given and was not set in the index" with pytest.raises(ValueError, match=msg): no_freq.tshift() + + def test_shift_dt64values_int_fill_deprecated(self): + # GH#31971 + ser = pd.Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) + df = ser.to_frame() + + with tm.assert_produces_warning(FutureWarning): + result = df.shift(1, fill_value=0) + + expected = pd.Series([pd.Timestamp(0), ser[0]]).to_frame() + tm.assert_frame_equal(result, expected) + + # axis = 1 + df2 = pd.DataFrame({"A": ser, "B": ser}) + df2._consolidate_inplace() + + with tm.assert_produces_warning(FutureWarning): + result = df2.shift(1, axis=1, fill_value=0) + + expected = pd.DataFrame( + {"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": df2["A"]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 5a25d1c2c0894..3d3bb98f80ac5 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -43,7 +43,7 @@ def test_sort_values(self): sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False]) tm.assert_frame_equal(sorted_df, expected) - msg = "No axis named 2 for object type " + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): frame.sort_values(by=["A", "B"], axis=2, inplace=True) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index cd9bd169322fd..f1656b46cf356 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -70,8 +70,17 @@ def test_to_dict_invalid_orient(self): with pytest.raises(ValueError, match=msg): df.to_dict(orient="xinvalid") + @pytest.mark.parametrize("orient", ["d", "l", "r", "sp", "s", "i"]) + def test_to_dict_short_orient_warns(self, orient): + # GH#32515 + df = DataFrame({"A": [0, 1]}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df.to_dict(orient=orient) + @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict]) def test_to_dict(self, mapping): + # orient= should only take the listed options + # see GH#32515 test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} # GH#16122 @@ -81,19 +90,19 @@ def test_to_dict(self, mapping): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("l", mapping) + recons_data = DataFrame(test_data).to_dict("list", mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][int(k2) - 1] - recons_data = DataFrame(test_data).to_dict("s", mapping) + recons_data = DataFrame(test_data).to_dict("series", mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("sp", mapping) + recons_data = DataFrame(test_data).to_dict("split", mapping) expected_split = { "columns": ["A", "B"], "index": ["1", "2", "3"], @@ -101,7 +110,7 @@ def test_to_dict(self, mapping): } tm.assert_dict_equal(recons_data, expected_split) - recons_data = DataFrame(test_data).to_dict("r", mapping) + recons_data = DataFrame(test_data).to_dict("records", mapping) expected_records = [ {"A": 1.0, "B": "1"}, {"A": 2.0, "B": "2"}, @@ -113,7 +122,7 @@ def test_to_dict(self, mapping): tm.assert_dict_equal(l, r) # GH#10844 - recons_data = DataFrame(test_data).to_dict("i") + recons_data = 
DataFrame(test_data).to_dict("index") for k, v in test_data.items(): for k2, v2 in v.items(): @@ -121,7 +130,7 @@ def test_to_dict(self, mapping): df = DataFrame(test_data) df["duped"] = df[df.columns[0]] - recons_data = df.to_dict("i") + recons_data = df.to_dict("index") comp_data = test_data.copy() comp_data["duped"] = comp_data[df.columns[0]] for k, v in comp_data.items(): diff --git a/pandas/tests/frame/methods/test_to_period.py b/pandas/tests/frame/methods/test_to_period.py index eac78e611b008..051461b6c554d 100644 --- a/pandas/tests/frame/methods/test_to_period.py +++ b/pandas/tests/frame/methods/test_to_period.py @@ -31,6 +31,6 @@ def test_frame_to_period(self): pts = df.to_period("M", axis=1) tm.assert_index_equal(pts.columns, exp.columns.asfreq("M")) - msg = "No axis named 2 for object type " + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.to_period(axis=2) diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index d0181f0309af1..34b323e55d8cd 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -3,7 +3,14 @@ import numpy as np import pytest -from pandas import CategoricalDtype, DataFrame, MultiIndex, Series, date_range +from pandas import ( + CategoricalDtype, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, +) import pandas._testing as tm @@ -18,6 +25,17 @@ def test_to_records_dt64(self): result = df.to_records()["index"][0] assert expected == result + def test_to_records_dt64tz_column(self): + # GH#32535 dont less tz in to_records + df = DataFrame({"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")}) + + result = df.to_records() + + assert result.dtype["A"] == object + val = result[0][1] + assert isinstance(val, Timestamp) + assert val == df.loc[0, "A"] + def test_to_records_with_multindex(self): # GH#3189 index = [ diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 751ed1dfdd847..b28e8a5b347aa 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -234,9 +234,16 @@ def test_set_index_pass_arrays_duplicate( # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; - # use "is" because == would give ambiguous Boolean error for containers + # plain == would give ambiguous Boolean error for containers first_drop = ( - False if (keys[0] is "A" and keys[1] is "A") else drop # noqa: F632 + False + if ( + isinstance(keys[0], str) + and keys[0] == "A" + and isinstance(keys[1], str) + and keys[1] == "A" + ) + else drop ) # to test against already-tested behaviour, we add sequentially, # hence second append always True; must wrap keys in list, otherwise @@ -615,118 +622,6 @@ def test_dti_set_index_reindex(self): # Renaming - def test_rename_axis_inplace(self, float_frame): - # GH 15704 - expected = float_frame.rename_axis("foo") - result = float_frame.copy() - no_return = result.rename_axis("foo", inplace=True) - - assert no_return is None - tm.assert_frame_equal(result, expected) - - expected = float_frame.rename_axis("bar", axis=1) - result = float_frame.copy() - no_return = result.rename_axis("bar", axis=1, inplace=True) - - assert no_return is None - tm.assert_frame_equal(result, expected) - - def test_rename_axis_raises(self): - # https://github.com/pandas-dev/pandas/issues/17833 - df = DataFrame({"A": [1, 2], "B": [1, 2]}) - with 
pytest.raises(ValueError, match="Use `.rename`"): - df.rename_axis(id, axis=0) - - with pytest.raises(ValueError, match="Use `.rename`"): - df.rename_axis({0: 10, 1: 20}, axis=0) - - with pytest.raises(ValueError, match="Use `.rename`"): - df.rename_axis(id, axis=1) - - with pytest.raises(ValueError, match="Use `.rename`"): - df["A"].rename_axis(id) - - def test_rename_axis_mapper(self): - # GH 19978 - mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) - df = DataFrame( - {"x": list(range(len(mi))), "y": [i * 10 for i in range(len(mi))]}, index=mi - ) - - # Test for rename of the Index object of columns - result = df.rename_axis("cols", axis=1) - tm.assert_index_equal(result.columns, Index(["x", "y"], name="cols")) - - # Test for rename of the Index object of columns using dict - result = result.rename_axis(columns={"cols": "new"}, axis=1) - tm.assert_index_equal(result.columns, Index(["x", "y"], name="new")) - - # Test for renaming index using dict - result = df.rename_axis(index={"ll": "foo"}) - assert result.index.names == ["foo", "nn"] - - # Test for renaming index using a function - result = df.rename_axis(index=str.upper, axis=0) - assert result.index.names == ["LL", "NN"] - - # Test for renaming index providing complete list - result = df.rename_axis(index=["foo", "goo"]) - assert result.index.names == ["foo", "goo"] - - # Test for changing index and columns at same time - sdf = df.reset_index().set_index("nn").drop(columns=["ll", "y"]) - result = sdf.rename_axis(index="foo", columns="meh") - assert result.index.name == "foo" - assert result.columns.name == "meh" - - # Test different error cases - with pytest.raises(TypeError, match="Must pass"): - df.rename_axis(index="wrong") - - with pytest.raises(ValueError, match="Length of names"): - df.rename_axis(index=["wrong"]) - - with pytest.raises(TypeError, match="bogus"): - df.rename_axis(bogus=None) - - def test_reorder_levels(self): - index = MultiIndex( - levels=[["bar"], ["one", "two", "three"], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], - names=["L0", "L1", "L2"], - ) - df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index) - - # no change, position - result = df.reorder_levels([0, 1, 2]) - tm.assert_frame_equal(df, result) - - # no change, labels - result = df.reorder_levels(["L0", "L1", "L2"]) - tm.assert_frame_equal(df, result) - - # rotate, position - result = df.reorder_levels([1, 2, 0]) - e_idx = MultiIndex( - levels=[["one", "two", "three"], [0, 1], ["bar"]], - codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], - names=["L1", "L2", "L0"], - ) - expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) - tm.assert_frame_equal(result, expected) - - result = df.reorder_levels([0, 0, 0]) - e_idx = MultiIndex( - levels=[["bar"], ["bar"], ["bar"]], - codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], - names=["L0", "L0", "L0"], - ) - expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) - tm.assert_frame_equal(result, expected) - - result = df.reorder_levels(["L0", "L0", "L0"]) - tm.assert_frame_equal(result, expected) - def test_set_index_names(self): df = tm.makeDataFrame() df.index.name = "name" @@ -840,25 +735,6 @@ def test_reindex_signature(self): "tolerance", } - def test_droplevel(self): - # GH20342 - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) - df = df.set_index([0, 1]).rename_axis(["a", "b"]) - df.columns = MultiIndex.from_tuples( - [("c", "e"), ("d", "f")], 
names=["level_1", "level_2"] - ) - - # test that dropping of a level in index works - expected = df.reset_index("a", drop=True) - result = df.droplevel("a", axis="index") - tm.assert_frame_equal(result, expected) - - # test that dropping of a level in columns works - expected = df.copy() - expected.columns = Index(["c", "d"], name="level_1") - result = df.droplevel("level_2", axis="columns") - tm.assert_frame_equal(result, expected) - class TestIntervalIndex: def test_setitem(self): @@ -907,35 +783,3 @@ def test_set_reset_index(self): df = df.set_index("B") df = df.reset_index() - - def test_set_axis_inplace(self): - # GH14636 - df = DataFrame( - {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, - index=[2010, 2011, 2012], - ) - - expected = {0: df.copy(), 1: df.copy()} - expected[0].index = list("abc") - expected[1].columns = list("abc") - expected["index"] = expected[0] - expected["columns"] = expected[1] - - for axis in expected: - result = df.copy() - result.set_axis(list("abc"), axis=axis, inplace=True) - tm.assert_frame_equal(result, expected[axis]) - - # inplace=False - result = df.set_axis(list("abc"), axis=axis) - tm.assert_frame_equal(expected[axis], result) - - # omitting the "axis" parameter - with tm.assert_produces_warning(None): - result = df.set_axis(list("abc")) - tm.assert_frame_equal(result, expected[0]) - - # wrong values for the "axis" parameter - for axis in 3, "foo": - with pytest.raises(ValueError, match="No axis named"): - df.set_axis(list("abc"), axis=axis) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 61802956addeb..0255759513e28 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -331,6 +331,8 @@ def kurt(x): check_dates=True, ) + # GH#32571 check_less_precise is needed on apparently-random + # py37-npdev builds and OSX-PY36-min_version builds # mixed types (with upcasting happening) assert_stat_op_calc( "sum", @@ -344,7 +346,9 @@ def kurt(x): "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum ) assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True) - assert_stat_op_calc("product", np.prod, float_frame_with_na) + assert_stat_op_calc( + "product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod + ) assert_stat_op_calc("mad", mad, float_frame_with_na) assert_stat_op_calc("var", var, float_frame_with_na) @@ -875,11 +879,6 @@ def test_mean_datetimelike(self): expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - reason="casts to object-dtype and then tries to add timestamps", - raises=TypeError, - strict=True, - ) def test_mean_datetimelike_numeric_only_false(self): df = pd.DataFrame( { @@ -913,8 +912,8 @@ def test_sum_bools(self): def test_idxmin(self, float_frame, int_frame): frame = float_frame - frame.loc[5:10] = np.nan - frame.loc[15:20, -2:] = np.nan + frame.iloc[5:10] = np.nan + frame.iloc[15:20, -2:] = np.nan for skipna in [True, False]: for axis in [0, 1]: for df in [frame, int_frame]: @@ -922,14 +921,14 @@ def test_idxmin(self, float_frame, int_frame): expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = "No axis named 2 for object type " + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) def test_idxmax(self, float_frame, int_frame): frame = float_frame - frame.loc[5:10] = np.nan - frame.loc[15:20, -2:] = np.nan + 
frame.iloc[5:10] = np.nan + frame.iloc[15:20, -2:] = np.nan for skipna in [True, False]: for axis in [0, 1]: for df in [frame, int_frame]: @@ -937,7 +936,7 @@ def test_idxmax(self, float_frame, int_frame): expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = "No axis named 2 for object type " + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): frame.idxmax(axis=2) @@ -1148,59 +1147,6 @@ def test_any_all_level_axis_none_raises(self, method): # --------------------------------------------------------------------- # Matrix-like - def test_dot(self): - a = DataFrame( - np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] - ) - b = DataFrame( - np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"] - ) - - result = a.dot(b) - expected = DataFrame( - np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] - ) - # Check alignment - b1 = b.reindex(index=reversed(b.index)) - result = a.dot(b) - tm.assert_frame_equal(result, expected) - - # Check series argument - result = a.dot(b["one"]) - tm.assert_series_equal(result, expected["one"], check_names=False) - assert result.name is None - - result = a.dot(b1["one"]) - tm.assert_series_equal(result, expected["one"], check_names=False) - assert result.name is None - - # can pass correct-length arrays - row = a.iloc[0].values - - result = a.dot(row) - expected = a.dot(a.iloc[0]) - tm.assert_series_equal(result, expected) - - with pytest.raises(ValueError, match="Dot product shape mismatch"): - a.dot(row[:-1]) - - a = np.random.rand(1, 5) - b = np.random.rand(5, 1) - A = DataFrame(a) - - # TODO(wesm): unused - B = DataFrame(b) # noqa - - # it works - result = A.dot(b) - - # unaligned - df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4)) - df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3]) - - with pytest.raises(ValueError, match="aligned"): - df.dot(df2) - def test_matmul(self): # matmul test is for GH 10259 a = DataFrame( @@ -1275,3 +1221,28 @@ def test_series_broadcasting(self): df_nan.clip(lower=s, axis=0) for op in ["lt", "le", "gt", "ge", "eq", "ne"]: getattr(df, op)(s_nan, axis=0) + + +class TestDataFrameReductions: + def test_min_max_dt64_with_NaT(self): + # Both NaT and Timestamp are in DataFrame. + df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]}) + + res = df.min() + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) + tm.assert_series_equal(res, exp) + + # GH12941, only NaTs are in DataFrame. 
+ df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]}) + + res = df.min() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a021dd91a7d26..4149485be181d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -127,6 +127,14 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) + def test_column_name_contains_unicode_surrogate(self): + # GH 25509 + colname = "\ud83d" + df = DataFrame({colname: []}) + # this should not crash + assert colname not in dir(df) + assert df.columns[0] == colname + def test_new_empty_index(self): df1 = DataFrame(np.random.randn(0, 3)) df2 = DataFrame(np.random.randn(0, 3)) @@ -363,10 +371,7 @@ def test_swapaxes(self): tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) tm.assert_frame_equal(df.T, df.swapaxes(1, 0)) tm.assert_frame_equal(df, df.swapaxes(0, 0)) - msg = ( - "No axis named 2 for object type " - r"" - ) + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.swapaxes(2, 5) @@ -535,3 +540,21 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + + def test_cache_on_copy(self): + # GH 31784 _item_cache not cleared on copy causes incorrect reads after updates + df = DataFrame({"a": [1]}) + + df["x"] = [0] + df["a"] + + df.copy() + + df["a"].values[0] = -1 + + tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0]})) + + df["y"] = [0] + + assert df["a"].values[0] == -1 + tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0], "y": [0]})) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index fe6abef97acc4..e328523253144 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -12,7 +12,6 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, notna import pandas._testing as tm -from pandas.conftest import _get_cython_table_params from pandas.core.apply import frame_apply from pandas.core.base import SpecificationError @@ -49,7 +48,8 @@ def test_apply(self, float_frame): # invalid axis df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) - with pytest.raises(ValueError): + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): df.apply(lambda x: x, 2) # GH 9573 @@ -221,7 +221,8 @@ def test_apply_broadcast_error(self, int_frame_const_col): df = int_frame_const_col # > 1 ndim - with pytest.raises(ValueError): + msg = "too many dims to broadcast" + with pytest.raises(ValueError, match=msg): df.apply( lambda x: np.array([1, 2]).reshape(-1, 2), axis=1, @@ -229,13 +230,21 @@ def test_apply_broadcast_error(self, int_frame_const_col): ) # cannot broadcast - with pytest.raises(ValueError): + msg = "cannot broadcast result" + with pytest.raises(ValueError, match=msg): df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") - def test_apply_raw(self, float_frame): + def test_apply_raw(self, float_frame, mixed_type_frame): + def _assert_raw(x): + assert isinstance(x, np.ndarray) + assert x.ndim == 1 + + float_frame.apply(_assert_raw, raw=True) + float_frame.apply(_assert_raw, axis=1, raw=True) + result0 = 
float_frame.apply(np.mean, raw=True) result1 = float_frame.apply(np.mean, axis=1, raw=True) @@ -250,6 +259,10 @@ def test_apply_raw(self, float_frame): expected = float_frame * 2 tm.assert_frame_equal(result, expected) + # Mixed dtype (GH-32423) + mixed_type_frame.apply(_assert_raw, raw=True) + mixed_type_frame.apply(_assert_raw, axis=1, raw=True) + def test_apply_axis1(self, float_frame): d = float_frame.index[0] tapplied = float_frame.apply(np.mean, axis=1) @@ -339,7 +352,7 @@ def test_apply_yield_list(self, float_frame): tm.assert_frame_equal(result, float_frame) def test_apply_reduce_Series(self, float_frame): - float_frame.loc[::2, "A"] = np.nan + float_frame["A"].iloc[::2] = np.nan expected = float_frame.mean(1) result = float_frame.apply(np.mean, axis=1) tm.assert_series_equal(result, expected) @@ -939,7 +952,11 @@ def test_result_type_error(self, result_type, int_frame_const_col): # allowed result_type df = int_frame_const_col - with pytest.raises(ValueError): + msg = ( + "invalid value for result_type, must be one of " + "{None, 'reduce', 'broadcast', 'expand'}" + ) + with pytest.raises(ValueError, match=msg): df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type) @pytest.mark.parametrize( @@ -1035,14 +1052,16 @@ def test_agg_transform(self, axis, float_frame): def test_transform_and_agg_err(self, axis, float_frame): # cannot both transform and agg - with pytest.raises(ValueError): + msg = "transforms cannot produce aggregated results" + with pytest.raises(ValueError, match=msg): float_frame.transform(["max", "min"], axis=axis) - with pytest.raises(ValueError): + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): float_frame.agg(["max", "sqrt"], axis=axis) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): float_frame.transform(["max", "sqrt"], axis=axis) @@ -1303,7 +1322,7 @@ def func(group_col): @pytest.mark.parametrize( "df, func, expected", chain( - _get_cython_table_params( + tm.get_cython_table_params( DataFrame(), [ ("sum", Series(dtype="float64")), @@ -1318,7 +1337,7 @@ def func(group_col): ("median", Series(dtype="float64")), ], ), - _get_cython_table_params( + tm.get_cython_table_params( DataFrame([[np.nan, 1], [1, 2]]), [ ("sum", Series([1.0, 3])), @@ -1345,10 +1364,10 @@ def test_agg_cython_table(self, df, func, expected, axis): @pytest.mark.parametrize( "df, func, expected", chain( - _get_cython_table_params( + tm.get_cython_table_params( DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] ), - _get_cython_table_params( + tm.get_cython_table_params( DataFrame([[np.nan, 1], [1, 2]]), [ ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), @@ -1370,13 +1389,14 @@ def test_agg_cython_table_transform(self, df, func, expected, axis): @pytest.mark.parametrize( "df, func, expected", - _get_cython_table_params( + tm.get_cython_table_params( DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] ), ) def test_agg_cython_table_raises(self, df, func, expected, axis): # GH 21224 - with pytest.raises(expected): + msg = "can't multiply sequence by non-int of type 'str'" + with pytest.raises(expected, match=msg): df.agg(func, axis=axis) @pytest.mark.parametrize("num_cols", [2, 3, 5]) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 44ad55517dcea..d929d3e030508 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,12 +1,16 @@ 
from collections import deque from datetime import datetime import operator +import re import numpy as np import pytest +import pytz import pandas as pd +from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm +import pandas.core.common as com from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int # ------------------------------------------------------------------- @@ -45,13 +49,16 @@ def check(df, df2): ) tm.assert_frame_equal(result, expected) - with pytest.raises(TypeError): + msg = re.escape( + "Invalid comparison between dtype=datetime64[ns] and ndarray" + ) + with pytest.raises(TypeError, match=msg): x >= y - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): x > y - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): x < y - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): x <= y # GH4968 @@ -97,9 +104,13 @@ def test_timestamp_compare(self): result = right_f(pd.Timestamp("20010109"), df) tm.assert_frame_equal(result, expected) else: - with pytest.raises(TypeError): + msg = ( + "'(<|>)=?' not supported between " + "instances of 'Timestamp' and 'float'" + ) + with pytest.raises(TypeError, match=msg): left_f(df, pd.Timestamp("20010109")) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("20010109"), df) # nats expected = left_f(df, pd.Timestamp("nat")) @@ -347,6 +358,25 @@ def test_floordiv_axis0(self): result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) + @pytest.mark.slow + @pytest.mark.parametrize("opname", ["floordiv", "pow"]) + def test_floordiv_axis0_numexpr_path(self, opname): + # case that goes through numexpr and has to fall back to masked_arith_op + op = getattr(operator, opname) + + arr = np.arange(10 ** 6).reshape(100, -1) + df = pd.DataFrame(arr) + df["C"] = 1.0 + + ser = df[0] + result = getattr(df, opname)(ser, axis=0) + + expected = pd.DataFrame({col: op(df[col], ser) for col in df.columns}) + tm.assert_frame_equal(result, expected) + + result2 = getattr(df, opname)(ser.values, axis=0) + tm.assert_frame_equal(result2, expected) + def test_df_add_td64_columnwise(self): # GH 22534 Check that column-wise addition broadcasts correctly dti = pd.date_range("2016-01-01", periods=10) @@ -502,6 +532,15 @@ def test_arith_flex_zero_len_raises(self): with pytest.raises(NotImplementedError, match="fill_value"): df_len0.sub(df["A"], axis=None, fill_value=3) + def test_flex_add_scalar_fill_value(self): + # GH#12723 + dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") + df = pd.DataFrame({"foo": dat}, index=range(6)) + + exp = df.fillna(0).add(2) + res = df.add(2, fill_value=0) + tm.assert_frame_equal(res, exp) + class TestFrameArithmetic: def test_td64_op_nat_casting(self): @@ -771,3 +810,682 @@ def test_frame_single_columns_object_sum_axis_1(): result = df.sum(axis=1) expected = pd.Series(["A", 1.2, 0]) tm.assert_series_equal(result, expected) + + +# ------------------------------------------------------------------- +# Unsorted +# These arithmetic tests were previously in other files, eventually +# should be parametrized and put into tests.arithmetic + + +class TestFrameArithmeticUnsorted: + def test_frame_add_tz_mismatch_converts_to_utc(self): + rng = pd.date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") + df = pd.DataFrame(np.random.randn(len(rng)), index=rng, columns=["a"]) + + df_moscow = df.tz_convert("Europe/Moscow") + result = df + df_moscow + assert result.index.tz is 
pytz.utc + + result = df_moscow + df + assert result.index.tz is pytz.utc + + def test_align_frame(self): + rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") + ts = pd.DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts + ts[::2] + expected = ts + ts + expected.values[1::2] = np.nan + tm.assert_frame_equal(result, expected) + + half = ts[::2] + result = ts + half.take(np.random.permutation(len(half))) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "op", [operator.add, operator.sub, operator.mul, operator.truediv] + ) + def test_operators_none_as_na(self, op): + df = DataFrame( + {"col1": [2, 5.0, 123, None], "col2": [1, 2, 3, 4]}, dtype=object + ) + + # since filling converts dtypes from object, changed expected to be + # object + filled = df.fillna(np.nan) + result = op(df, 3) + expected = op(filled, 3).astype(object) + expected[com.isna(expected)] = None + tm.assert_frame_equal(result, expected) + + result = op(df, df) + expected = op(filled, filled).astype(object) + expected[com.isna(expected)] = None + tm.assert_frame_equal(result, expected) + + result = op(df, df.fillna(7)) + tm.assert_frame_equal(result, expected) + + result = op(df.fillna(7), df) + tm.assert_frame_equal(result, expected, check_dtype=False) + + @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)]) + # TODO: not sure what's correct here. + @pytest.mark.filterwarnings("ignore:elementwise:FutureWarning") + def test_logical_typeerror_with_non_valid(self, op, res, float_frame): + # we are comparing floats vs a string + result = getattr(float_frame, op)("foo") + assert bool(result.all().all()) is res + + def test_binary_ops_align(self): + + # test aligning binary ops + + # GH 6681 + index = MultiIndex.from_product( + [list("abc"), ["one", "two", "three"], [1, 2, 3]], + names=["first", "second", "third"], + ) + + df = DataFrame( + np.arange(27 * 3).reshape(27, 3), + index=index, + columns=["value1", "value2", "value3"], + ).sort_index() + + idx = pd.IndexSlice + for op in ["add", "sub", "mul", "div", "truediv"]: + opa = getattr(operator, op, None) + if opa is None: + continue + + x = Series([1.0, 10.0, 100.0], [1, 2, 3]) + result = getattr(df, op)(x, level="third", axis=0) + + expected = pd.concat( + [opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()] + ).sort_index() + tm.assert_frame_equal(result, expected) + + x = Series([1.0, 10.0], ["two", "three"]) + result = getattr(df, op)(x, level="second", axis=0) + + expected = ( + pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.items()]) + .reindex_like(df) + .sort_index() + ) + tm.assert_frame_equal(result, expected) + + # GH9463 (alignment level of dataframe with series) + + midx = MultiIndex.from_product([["A", "B"], ["a", "b"]]) + df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx) + s = pd.Series({"a": 1, "b": 2}) + + df2 = df.copy() + df2.columns.names = ["lvl0", "lvl1"] + s2 = s.copy() + s2.index.name = "lvl1" + + # different cases of integer/string level names: + res1 = df.mul(s, axis=1, level=1) + res2 = df.mul(s2, axis=1, level=1) + res3 = df2.mul(s, axis=1, level=1) + res4 = df2.mul(s2, axis=1, level=1) + res5 = df2.mul(s, axis=1, level="lvl1") + res6 = df2.mul(s2, axis=1, level="lvl1") + + exp = DataFrame( + np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx + ) + + for res in [res1, res2]: + tm.assert_frame_equal(res, exp) + + exp.columns.names = ["lvl0", "lvl1"] + for res in [res3, res4, res5, res6]: + tm.assert_frame_equal(res, exp) + + def 
test_add_with_dti_mismatched_tzs(self): + base = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC") + idx1 = base.tz_convert("Asia/Tokyo")[:2] + idx2 = base.tz_convert("US/Eastern")[1:] + + df1 = DataFrame({"A": [1, 2]}, index=idx1) + df2 = DataFrame({"A": [1, 1]}, index=idx2) + exp = DataFrame({"A": [np.nan, 3, np.nan]}, index=base) + tm.assert_frame_equal(df1 + df2, exp) + + def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): + frame_copy = float_frame.reindex(float_frame.index[::2]) + + del frame_copy["D"] + frame_copy["C"][:5] = np.nan + + added = float_frame + frame_copy + + indexer = added["A"].dropna().index + exp = (float_frame["A"] * 2).copy() + + tm.assert_series_equal(added["A"].dropna(), exp.loc[indexer]) + + exp.loc[~exp.index.isin(indexer)] = np.nan + tm.assert_series_equal(added["A"], exp.loc[added["A"].index]) + + assert np.isnan(added["C"].reindex(frame_copy.index)[:5]).all() + + # assert(False) + + assert np.isnan(added["D"]).all() + + self_added = float_frame + float_frame + tm.assert_index_equal(self_added.index, float_frame.index) + + added_rev = frame_copy + float_frame + assert np.isnan(added["D"]).all() + assert np.isnan(added_rev["D"]).all() + + # corner cases + + # empty + plus_empty = float_frame + DataFrame() + assert np.isnan(plus_empty.values).all() + + empty_plus = DataFrame() + float_frame + assert np.isnan(empty_plus.values).all() + + empty_empty = DataFrame() + DataFrame() + assert empty_empty.empty + + # out of order + reverse = float_frame.reindex(columns=float_frame.columns[::-1]) + + tm.assert_frame_equal(reverse + float_frame, float_frame * 2) + + # mix vs float64, upcast + added = float_frame + mixed_float_frame + _check_mixed_float(added, dtype="float64") + added = mixed_float_frame + float_frame + _check_mixed_float(added, dtype="float64") + + # mix vs mix + added = mixed_float_frame + mixed_float_frame + _check_mixed_float(added, dtype=dict(C=None)) + + # with int + added = float_frame + mixed_int_frame + _check_mixed_float(added, dtype="float64") + + def test_combine_series( + self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame + ): + + # Series + series = float_frame.xs(float_frame.index[0]) + + added = float_frame + series + + for key, s in added.items(): + tm.assert_series_equal(s, float_frame[key] + series[key]) + + larger_series = series.to_dict() + larger_series["E"] = 1 + larger_series = Series(larger_series) + larger_added = float_frame + larger_series + + for key, s in float_frame.items(): + tm.assert_series_equal(larger_added[key], s + series[key]) + assert "E" in larger_added + assert np.isnan(larger_added["E"]).all() + + # no upcast needed + added = mixed_float_frame + series + _check_mixed_float(added) + + # vs mix (upcast) as needed + added = mixed_float_frame + series.astype("float32") + _check_mixed_float(added, dtype=dict(C=None)) + added = mixed_float_frame + series.astype("float16") + _check_mixed_float(added, dtype=dict(C=None)) + + # FIXME: don't leave commented-out + # these raise with numexpr.....as we are adding an int64 to an + # uint64....weird vs int + + # added = mixed_int_frame + (100*series).astype('int64') + # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C = + # 'int64', D = 'int64')) + # added = mixed_int_frame + (100*series).astype('int32') + # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C = + # 'int32', D = 'int64')) + + # TimeSeries + ts = datetime_frame["A"] + + # 10890 + # we no longer allow auto timeseries 
broadcasting + # and require explicit broadcasting + added = datetime_frame.add(ts, axis="index") + + for key, col in datetime_frame.items(): + result = col + ts + tm.assert_series_equal(added[key], result, check_names=False) + assert added[key].name == key + if col.name == ts.name: + assert result.name == "A" + else: + assert result.name is None + + smaller_frame = datetime_frame[:-5] + smaller_added = smaller_frame.add(ts, axis="index") + + tm.assert_index_equal(smaller_added.index, datetime_frame.index) + + smaller_ts = ts[:-5] + smaller_added2 = datetime_frame.add(smaller_ts, axis="index") + tm.assert_frame_equal(smaller_added, smaller_added2) + + # length 0, result is all-nan + result = datetime_frame.add(ts[:0], axis="index") + expected = DataFrame( + np.nan, index=datetime_frame.index, columns=datetime_frame.columns + ) + tm.assert_frame_equal(result, expected) + + # Frame is all-nan + result = datetime_frame[:0].add(ts, axis="index") + expected = DataFrame( + np.nan, index=datetime_frame.index, columns=datetime_frame.columns + ) + tm.assert_frame_equal(result, expected) + + # empty but with non-empty index + frame = datetime_frame[:1].reindex(columns=[]) + result = frame.mul(ts, axis="index") + assert len(result) == len(ts) + + def test_combineFunc(self, float_frame, mixed_float_frame): + result = float_frame * 2 + tm.assert_numpy_array_equal(result.values, float_frame.values * 2) + + # vs mix + result = mixed_float_frame * 2 + for c, s in result.items(): + tm.assert_numpy_array_equal(s.values, mixed_float_frame[c].values * 2) + _check_mixed_float(result, dtype=dict(C=None)) + + result = DataFrame() * 2 + assert result.index.equals(DataFrame().index) + assert len(result.columns) == 0 + + def test_comparisons(self, simple_frame, float_frame): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + + row = simple_frame.xs("a") + ndim_5 = np.ones(df1.shape + (1, 1, 1)) + + def test_comp(func): + result = func(df1, df2) + tm.assert_numpy_array_equal(result.values, func(df1.values, df2.values)) + + msg = ( + "Unable to coerce to Series/DataFrame, " + "dimension must be <= 2: (30, 4, 1, 1, 1)" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + func(df1, ndim_5) + + result2 = func(simple_frame, row) + tm.assert_numpy_array_equal( + result2.values, func(simple_frame.values, row.values) + ) + + result3 = func(float_frame, 0) + tm.assert_numpy_array_equal(result3.values, func(float_frame.values, 0)) + + msg = "Can only compare identically-labeled DataFrame" + with pytest.raises(ValueError, match=msg): + func(simple_frame, simple_frame[:2]) + + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) + + def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): + # GH 11565 + df = DataFrame( + {x: {"x": "foo", "y": "bar", "z": "baz"} for x in ["a", "b", "c"]} + ) + + f = getattr(operator, compare_operators_no_eq_ne) + msg = "'[<>]=?' 
not supported between instances of 'str' and 'int'" + with pytest.raises(TypeError, match=msg): + f(df, 0) + + def test_comparison_protected_from_errstate(self): + missing_df = tm.makeDataFrame() + missing_df.iloc[0]["A"] = np.nan + with np.errstate(invalid="ignore"): + expected = missing_df.values < 0 + with np.errstate(invalid="raise"): + result = (missing_df < 0).values + tm.assert_numpy_array_equal(result, expected) + + def test_boolean_comparison(self): + + # GH 4576 + # boolean comparisons with a tuple/list give unexpected results + df = DataFrame(np.arange(6).reshape((3, 2))) + b = np.array([2, 2]) + b_r = np.atleast_2d([2, 2]) + b_c = b_r.T + lst = [2, 2, 2] + tup = tuple(lst) + + # gt + expected = DataFrame([[False, False], [False, True], [True, True]]) + result = df > b + tm.assert_frame_equal(result, expected) + + result = df.values > b + tm.assert_numpy_array_equal(result, expected.values) + + msg1d = "Unable to coerce to Series, length must be 2: given 3" + msg2d = "Unable to coerce to DataFrame, shape must be" + msg2db = "operands could not be broadcast together with shapes" + with pytest.raises(ValueError, match=msg1d): + # wrong shape + df > lst + + with pytest.raises(ValueError, match=msg1d): + # wrong shape + result = df > tup + + # broadcasts like ndarray (GH#23000) + result = df > b_r + tm.assert_frame_equal(result, expected) + + result = df.values > b_r + tm.assert_numpy_array_equal(result, expected.values) + + with pytest.raises(ValueError, match=msg2d): + df > b_c + + with pytest.raises(ValueError, match=msg2db): + df.values > b_c + + # == + expected = DataFrame([[False, False], [True, False], [False, False]]) + result = df == b + tm.assert_frame_equal(result, expected) + + with pytest.raises(ValueError, match=msg1d): + result = df == lst + + with pytest.raises(ValueError, match=msg1d): + result = df == tup + + # broadcasts like ndarray (GH#23000) + result = df == b_r + tm.assert_frame_equal(result, expected) + + result = df.values == b_r + tm.assert_numpy_array_equal(result, expected.values) + + with pytest.raises(ValueError, match=msg2d): + df == b_c + + assert df.values.shape != b_c.shape + + # with alignment + df = DataFrame( + np.arange(6).reshape((3, 2)), columns=list("AB"), index=list("abc") + ) + expected.index = df.index + expected.columns = df.columns + + with pytest.raises(ValueError, match=msg1d): + result = df == lst + + with pytest.raises(ValueError, match=msg1d): + result = df == tup + + def test_inplace_ops_alignment(self): + + # inplace ops / ops alignment + # GH 8511 + + columns = list("abcdefg") + X_orig = DataFrame( + np.arange(10 * len(columns)).reshape(-1, len(columns)), + columns=columns, + index=range(10), + ) + Z = 100 * X_orig.iloc[:, 1:-1].copy() + block1 = list("bedcf") + subs = list("bcdef") + + # add + X = X_orig.copy() + result1 = (X[block1] + Z).reindex(columns=subs) + + X[block1] += Z + result2 = X.reindex(columns=subs) + + X = X_orig.copy() + result3 = (X[block1] + Z[block1]).reindex(columns=subs) + + X[block1] += Z[block1] + result4 = X.reindex(columns=subs) + + tm.assert_frame_equal(result1, result2) + tm.assert_frame_equal(result1, result3) + tm.assert_frame_equal(result1, result4) + + # sub + X = X_orig.copy() + result1 = (X[block1] - Z).reindex(columns=subs) + + X[block1] -= Z + result2 = X.reindex(columns=subs) + + X = X_orig.copy() + result3 = (X[block1] - Z[block1]).reindex(columns=subs) + + X[block1] -= Z[block1] + result4 = X.reindex(columns=subs) + + tm.assert_frame_equal(result1, result2) + tm.assert_frame_equal(result1, 
result3) + tm.assert_frame_equal(result1, result4) + + def test_inplace_ops_identity(self): + + # GH 5104 + # make sure that we are actually changing the object + s_orig = Series([1, 2, 3]) + df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5)) + + # no dtype change + s = s_orig.copy() + s2 = s + s += 1 + tm.assert_series_equal(s, s2) + tm.assert_series_equal(s_orig + 1, s) + assert s is s2 + assert s._mgr is s2._mgr + + df = df_orig.copy() + df2 = df + df += 1 + tm.assert_frame_equal(df, df2) + tm.assert_frame_equal(df_orig + 1, df) + assert df is df2 + assert df._mgr is df2._mgr + + # dtype change + s = s_orig.copy() + s2 = s + s += 1.5 + tm.assert_series_equal(s, s2) + tm.assert_series_equal(s_orig + 1.5, s) + + df = df_orig.copy() + df2 = df + df += 1.5 + tm.assert_frame_equal(df, df2) + tm.assert_frame_equal(df_orig + 1.5, df) + assert df is df2 + assert df._mgr is df2._mgr + + # mixed dtype + arr = np.random.randint(0, 10, size=5) + df_orig = DataFrame({"A": arr.copy(), "B": "foo"}) + df = df_orig.copy() + df2 = df + df["A"] += 1 + expected = DataFrame({"A": arr.copy() + 1, "B": "foo"}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, expected) + assert df._mgr is df2._mgr + + df = df_orig.copy() + df2 = df + df["A"] += 1.5 + expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, expected) + assert df._mgr is df2._mgr + + @pytest.mark.parametrize( + "op", + [ + "add", + "and", + "div", + "floordiv", + "mod", + "mul", + "or", + "pow", + "sub", + "truediv", + "xor", + ], + ) + def test_inplace_ops_identity2(self, op): + + if op == "div": + return + + df = DataFrame({"a": [1.0, 2.0, 3.0], "b": [1, 2, 3]}) + + operand = 2 + if op in ("and", "or", "xor"): + # cannot use floats for boolean ops + df["a"] = [True, False, True] + + df_copy = df.copy() + iop = f"__i{op}__" + op = f"__{op}__" + + # no id change and value is correct + getattr(df, iop)(operand) + expected = getattr(df_copy, op)(operand) + tm.assert_frame_equal(df, expected) + expected = id(df) + assert id(df) == expected + + def test_alignment_non_pandas(self): + index = ["A", "B", "C"] + columns = ["X", "Y", "Z"] + df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns) + + align = pd.core.ops._align_method_FRAME + for val in [ + [1, 2, 3], + (1, 2, 3), + np.array([1, 2, 3], dtype=np.int64), + range(1, 4), + ]: + + tm.assert_series_equal( + align(df, val, "index")[1], Series([1, 2, 3], index=df.index) + ) + tm.assert_series_equal( + align(df, val, "columns")[1], Series([1, 2, 3], index=df.columns) + ) + + # length mismatch + msg = "Unable to coerce to Series, length must be 3: given 2" + for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]: + + with pytest.raises(ValueError, match=msg): + align(df, val, "index") + + with pytest.raises(ValueError, match=msg): + align(df, val, "columns") + + val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + tm.assert_frame_equal( + align(df, val, "index")[1], + DataFrame(val, index=df.index, columns=df.columns), + ) + tm.assert_frame_equal( + align(df, val, "columns")[1], + DataFrame(val, index=df.index, columns=df.columns), + ) + + # shape mismatch + msg = "Unable to coerce to DataFrame, shape must be" + val = np.array([[1, 2, 3], [4, 5, 6]]) + with pytest.raises(ValueError, match=msg): + align(df, val, "index") + + with pytest.raises(ValueError, match=msg): + align(df, val, "columns") + + val = np.zeros((3, 3, 3)) + msg = re.escape( + "Unable to coerce to 
Series/DataFrame, dimension must be <= 2: (3, 3, 3)" + ) + with pytest.raises(ValueError, match=msg): + align(df, val, "index") + with pytest.raises(ValueError, match=msg): + align(df, val, "columns") + + def test_no_warning(self, all_arithmetic_operators): + df = pd.DataFrame({"A": [0.0, 0.0], "B": [0.0, None]}) + b = df["B"] + with tm.assert_produces_warning(None): + getattr(df, all_arithmetic_operators)(b, 0) + + +def test_pow_with_realignment(): + # GH#32685 pow has special semantics for operating with null values + left = pd.DataFrame({"A": [0, 1, 2]}) + right = pd.DataFrame(index=[0, 1, 2]) + + result = left ** right + expected = pd.DataFrame({"A": [np.nan, 1.0, np.nan]}) + tm.assert_frame_equal(result, expected) + + +# TODO: move to tests.arithmetic and parametrize +def test_pow_nan_with_zero(): + left = pd.DataFrame({"A": [np.nan, np.nan, np.nan]}) + right = pd.DataFrame({"A": [0, 0, 0]}) + + expected = pd.DataFrame({"A": [1.0, 1.0, 1.0]}) + + result = left ** right + tm.assert_frame_equal(result, expected) + + result = left["A"] ** right["A"] + tm.assert_series_equal(result, expected["A"]) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 7effa98fd8213..f61512b1a62d9 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -1,11 +1,8 @@ from datetime import datetime -import re import numpy as np import pytest -from pandas.errors import PerformanceWarning - import pandas as pd from pandas import Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna import pandas._testing as tm @@ -15,173 +12,6 @@ class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing - def test_drop_names(self): - df = DataFrame( - [[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=["a", "b", "c"], - columns=["d", "e", "f"], - ) - df.index.name, df.columns.name = "first", "second" - df_dropped_b = df.drop("b") - df_dropped_e = df.drop("e", axis=1) - df_inplace_b, df_inplace_e = df.copy(), df.copy() - df_inplace_b.drop("b", inplace=True) - df_inplace_e.drop("e", axis=1, inplace=True) - for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): - assert obj.index.name == "first" - assert obj.columns.name == "second" - assert list(df.columns) == ["d", "e", "f"] - - msg = r"\['g'\] not found in axis" - with pytest.raises(KeyError, match=msg): - df.drop(["g"]) - with pytest.raises(KeyError, match=msg): - df.drop(["g"], 1) - - # errors = 'ignore' - dropped = df.drop(["g"], errors="ignore") - expected = Index(["a", "b", "c"], name="first") - tm.assert_index_equal(dropped.index, expected) - - dropped = df.drop(["b", "g"], errors="ignore") - expected = Index(["a", "c"], name="first") - tm.assert_index_equal(dropped.index, expected) - - dropped = df.drop(["g"], axis=1, errors="ignore") - expected = Index(["d", "e", "f"], name="second") - tm.assert_index_equal(dropped.columns, expected) - - dropped = df.drop(["d", "g"], axis=1, errors="ignore") - expected = Index(["e", "f"], name="second") - tm.assert_index_equal(dropped.columns, expected) - - # GH 16398 - dropped = df.drop([], errors="ignore") - expected = Index(["a", "b", "c"], name="first") - tm.assert_index_equal(dropped.index, expected) - - def test_drop_col_still_multiindex(self): - arrays = [["a", "b", "c", "top"], ["", "", "", "OD"], ["", "", "", "wx"]] - - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - - df = 
DataFrame(np.random.randn(3, 4), columns=index) - del df[("a", "", "")] - assert isinstance(df.columns, MultiIndex) - - def test_drop(self): - simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) - tm.assert_frame_equal(simple.drop("A", axis=1), simple[["B"]]) - tm.assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]]) - tm.assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) - tm.assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) - - with pytest.raises(KeyError, match=r"\[5\] not found in axis"): - simple.drop(5) - with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): - simple.drop("C", 1) - with pytest.raises(KeyError, match=r"\[5\] not found in axis"): - simple.drop([1, 5]) - with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): - simple.drop(["A", "C"], 1) - - # errors = 'ignore' - tm.assert_frame_equal(simple.drop(5, errors="ignore"), simple) - tm.assert_frame_equal( - simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :] - ) - tm.assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple) - tm.assert_frame_equal( - simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]] - ) - - # non-unique - wheee! - nu_df = DataFrame( - list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"] - ) - tm.assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]]) - tm.assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"]) - tm.assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 - - nu_df = nu_df.set_index(pd.Index(["X", "Y", "X"])) - nu_df.columns = list("abc") - tm.assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) - tm.assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) - - # inplace cache issue - # GH 5628 - df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) - expected = df[~(df.b > 0)] - df.drop(labels=df[df.b > 0].index, inplace=True) - tm.assert_frame_equal(df, expected) - - def test_drop_multiindex_not_lexsorted(self): - # GH 11640 - - # define the lexsorted version - lexsorted_mi = MultiIndex.from_tuples( - [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] - ) - lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) - assert lexsorted_df.columns.is_lexsorted() - - # define the non-lexsorted version - not_lexsorted_df = DataFrame( - columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] - ) - not_lexsorted_df = not_lexsorted_df.pivot_table( - index="a", columns=["b", "c"], values="d" - ) - not_lexsorted_df = not_lexsorted_df.reset_index() - assert not not_lexsorted_df.columns.is_lexsorted() - - # compare the results - tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - - expected = lexsorted_df.drop("a", axis=1) - with tm.assert_produces_warning(PerformanceWarning): - result = not_lexsorted_df.drop("a", axis=1) - - tm.assert_frame_equal(result, expected) - - def test_drop_api_equivalence(self): - # equivalence of the labels/axis and index/columns API's (GH12392) - df = DataFrame( - [[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=["a", "b", "c"], - columns=["d", "e", "f"], - ) - - res1 = df.drop("a") - res2 = df.drop(index="a") - tm.assert_frame_equal(res1, res2) - - res1 = df.drop("d", 1) - res2 = df.drop(columns="d") - tm.assert_frame_equal(res1, res2) - - res1 = df.drop(labels="e", axis=1) - res2 = df.drop(columns="e") - tm.assert_frame_equal(res1, res2) - - res1 = df.drop(["a"], axis=0) - res2 = df.drop(index=["a"]) - tm.assert_frame_equal(res1, res2) - - res1 = 
df.drop(["a"], axis=0).drop(["d"], axis=1) - res2 = df.drop(index=["a"], columns=["d"]) - tm.assert_frame_equal(res1, res2) - - with pytest.raises(ValueError): - df.drop(labels="a", index="b") - - with pytest.raises(ValueError): - df.drop(labels="a", columns="b") - - with pytest.raises(ValueError): - df.drop(axis=1) - def test_merge_join_different_levels(self): # GH 9455 @@ -325,11 +155,6 @@ def test_reindex_int(self, int_frame): smaller = int_frame.reindex(columns=["A", "B"]) assert smaller["A"].dtype == np.int64 - def test_reindex_like(self, float_frame): - other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"]) - - tm.assert_frame_equal(other, float_frame.reindex_like(other)) - def test_reindex_columns(self, float_frame): new_frame = float_frame.reindex(columns=["A", "B", "E"]) @@ -556,187 +381,6 @@ def test_reindex_api_equivalence(self): for res in [res2, res3]: tm.assert_frame_equal(res1, res) - def test_align_float(self, float_frame): - af, bf = float_frame.align(float_frame) - assert af._data is not float_frame._data - - af, bf = float_frame.align(float_frame, copy=False) - assert af._data is float_frame._data - - # axis = 0 - other = float_frame.iloc[:-5, :3] - af, bf = float_frame.align(other, axis=0, fill_value=-1) - - tm.assert_index_equal(bf.columns, other.columns) - - # test fill value - join_idx = float_frame.index.join(other.index) - diff_a = float_frame.index.difference(join_idx) - diff_b = other.index.difference(join_idx) - diff_a_vals = af.reindex(diff_a).values - diff_b_vals = bf.reindex(diff_b).values - assert (diff_a_vals == -1).all() - - af, bf = float_frame.align(other, join="right", axis=0) - tm.assert_index_equal(bf.columns, other.columns) - tm.assert_index_equal(bf.index, other.index) - tm.assert_index_equal(af.index, other.index) - - # axis = 1 - other = float_frame.iloc[:-5, :3].copy() - af, bf = float_frame.align(other, axis=1) - tm.assert_index_equal(bf.columns, float_frame.columns) - tm.assert_index_equal(bf.index, other.index) - - # test fill value - join_idx = float_frame.index.join(other.index) - diff_a = float_frame.index.difference(join_idx) - diff_b = other.index.difference(join_idx) - diff_a_vals = af.reindex(diff_a).values - - # TODO(wesm): unused? 
- diff_b_vals = bf.reindex(diff_b).values # noqa - - assert (diff_a_vals == -1).all() - - af, bf = float_frame.align(other, join="inner", axis=1) - tm.assert_index_equal(bf.columns, other.columns) - - af, bf = float_frame.align(other, join="inner", axis=1, method="pad") - tm.assert_index_equal(bf.columns, other.columns) - - af, bf = float_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None - ) - tm.assert_index_equal(bf.index, Index([])) - - af, bf = float_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 - ) - tm.assert_index_equal(bf.index, Index([])) - - # Try to align DataFrame to Series along bad axis - with pytest.raises(ValueError): - float_frame.align(af.iloc[0, :3], join="inner", axis=2) - - # align dataframe to series with broadcast or not - idx = float_frame.index - s = Series(range(len(idx)), index=idx) - - left, right = float_frame.align(s, axis=0) - tm.assert_index_equal(left.index, float_frame.index) - tm.assert_index_equal(right.index, float_frame.index) - assert isinstance(right, Series) - - left, right = float_frame.align(s, broadcast_axis=1) - tm.assert_index_equal(left.index, float_frame.index) - expected = {c: s for c in float_frame.columns} - expected = DataFrame( - expected, index=float_frame.index, columns=float_frame.columns - ) - tm.assert_frame_equal(right, expected) - - # see gh-9558 - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - result = df[df["a"] == 2] - expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - - result = df.where(df["a"] == 2, 0) - expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) - tm.assert_frame_equal(result, expected) - - def test_align_int(self, int_frame): - # test other non-float types - other = DataFrame(index=range(5), columns=["A", "B", "C"]) - - af, bf = int_frame.align(other, join="inner", axis=1, method="pad") - tm.assert_index_equal(bf.columns, other.columns) - - def test_align_mixed_type(self, float_string_frame): - - af, bf = float_string_frame.align( - float_string_frame, join="inner", axis=1, method="pad" - ) - tm.assert_index_equal(bf.columns, float_string_frame.columns) - - def test_align_mixed_float(self, mixed_float_frame): - # mixed floats/ints - other = DataFrame(index=range(5), columns=["A", "B", "C"]) - - af, bf = mixed_float_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 - ) - tm.assert_index_equal(bf.index, Index([])) - - def test_align_mixed_int(self, mixed_int_frame): - other = DataFrame(index=range(5), columns=["A", "B", "C"]) - - af, bf = mixed_int_frame.align( - other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 - ) - tm.assert_index_equal(bf.index, Index([])) - - def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): - aa, ab = a.align( - b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis - ) - - join_index, join_columns = None, None - - ea, eb = a, b - if axis is None or axis == 0: - join_index = a.index.join(b.index, how=how) - ea = ea.reindex(index=join_index) - eb = eb.reindex(index=join_index) - - if axis is None or axis == 1: - join_columns = a.columns.join(b.columns, how=how) - ea = ea.reindex(columns=join_columns) - eb = eb.reindex(columns=join_columns) - - ea = ea.fillna(axis=fill_axis, method=method, limit=limit) - eb = eb.fillna(axis=fill_axis, method=method, limit=limit) - - tm.assert_frame_equal(aa, ea) - tm.assert_frame_equal(ab, eb) - - @pytest.mark.parametrize("meth", ["pad", "bfill"]) - 
@pytest.mark.parametrize("ax", [0, 1, None]) - @pytest.mark.parametrize("fax", [0, 1]) - @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) - def test_align_fill_method(self, how, meth, ax, fax, float_frame): - df = float_frame - self._check_align_fill(df, how, meth, ax, fax) - - def _check_align_fill(self, frame, kind, meth, ax, fax): - left = frame.iloc[0:4, :10] - right = frame.iloc[2:, 6:] - empty = frame.iloc[:0, :0] - - self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - - # empty left - self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - - # empty right - self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - - # both empty - self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - def test_align_int_fill_bug(self): # GH #910 X = np.arange(10 * 10, dtype="float64").reshape(10, 10) @@ -751,275 +395,6 @@ def test_align_int_fill_bug(self): expected = df2 - df2.mean() tm.assert_frame_equal(result, expected) - def test_align_multiindex(self): - # GH 10665 - # same test cases as test_align_multiindex in test_series.py - - midx = pd.MultiIndex.from_product( - [range(2), range(3), range(2)], names=("a", "b", "c") - ) - idx = pd.Index(range(2), name="b") - df1 = pd.DataFrame(np.arange(12, dtype="int64"), index=midx) - df2 = pd.DataFrame(np.arange(2, dtype="int64"), index=idx) - - # these must be the same results (but flipped) - res1l, res1r = df1.align(df2, join="left") - res2l, res2r = df2.align(df1, join="right") - - expl = df1 - tm.assert_frame_equal(expl, res1l) - tm.assert_frame_equal(expl, res2r) - expr = pd.DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) - tm.assert_frame_equal(expr, res1r) - tm.assert_frame_equal(expr, res2l) - - res1l, res1r = df1.align(df2, join="right") - res2l, res2r = df2.align(df1, join="left") - - exp_idx = pd.MultiIndex.from_product( - [range(2), range(2), range(2)], names=("a", "b", "c") - ) - expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) - tm.assert_frame_equal(expl, res1l) - tm.assert_frame_equal(expl, res2r) - expr = pd.DataFrame([0, 0, 1, 1] * 2, index=exp_idx) - tm.assert_frame_equal(expr, res1r) - tm.assert_frame_equal(expr, res2l) - - def test_align_series_combinations(self): - df = pd.DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) - s = pd.Series([1, 2, 4], index=list("ABD"), name="x") - - # frame + series - res1, res2 = df.align(s, axis=0) - exp1 = pd.DataFrame( - {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, - index=list("ABCDE"), - ) - exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") - - tm.assert_frame_equal(res1, exp1) - tm.assert_series_equal(res2, exp2) - - # series + frame - res1, res2 = s.align(df) - tm.assert_series_equal(res1, exp2) - tm.assert_frame_equal(res2, exp1) - - def test_filter(self, float_frame, float_string_frame): - # Items - filtered = float_frame.filter(["A", "B", "E"]) - assert len(filtered.columns) == 2 - assert "E" not in filtered - - filtered = float_frame.filter(["A", "B", "E"], axis="columns") - assert len(filtered.columns) == 2 - assert 
"E" not in filtered - - # Other axis - idx = float_frame.index[0:4] - filtered = float_frame.filter(idx, axis="index") - expected = float_frame.reindex(index=idx) - tm.assert_frame_equal(filtered, expected) - - # like - fcopy = float_frame.copy() - fcopy["AA"] = 1 - - filtered = fcopy.filter(like="A") - assert len(filtered.columns) == 2 - assert "AA" in filtered - - # like with ints in column names - df = DataFrame(0.0, index=[0, 1, 2], columns=[0, 1, "_A", "_B"]) - filtered = df.filter(like="_") - assert len(filtered.columns) == 2 - - # regex with ints in column names - # from PR #10384 - df = DataFrame(0.0, index=[0, 1, 2], columns=["A1", 1, "B", 2, "C"]) - expected = DataFrame( - 0.0, index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object) - ) - filtered = df.filter(regex="^[0-9]+$") - tm.assert_frame_equal(filtered, expected) - - expected = DataFrame(0.0, index=[0, 1, 2], columns=[0, "0", 1, "1"]) - # shouldn't remove anything - filtered = expected.filter(regex="^[0-9]+$") - tm.assert_frame_equal(filtered, expected) - - # pass in None - with pytest.raises(TypeError, match="Must pass"): - float_frame.filter() - with pytest.raises(TypeError, match="Must pass"): - float_frame.filter(items=None) - with pytest.raises(TypeError, match="Must pass"): - float_frame.filter(axis=1) - - # test mutually exclusive arguments - with pytest.raises(TypeError, match="mutually exclusive"): - float_frame.filter(items=["one", "three"], regex="e$", like="bbi") - with pytest.raises(TypeError, match="mutually exclusive"): - float_frame.filter(items=["one", "three"], regex="e$", axis=1) - with pytest.raises(TypeError, match="mutually exclusive"): - float_frame.filter(items=["one", "three"], regex="e$") - with pytest.raises(TypeError, match="mutually exclusive"): - float_frame.filter(items=["one", "three"], like="bbi", axis=0) - with pytest.raises(TypeError, match="mutually exclusive"): - float_frame.filter(items=["one", "three"], like="bbi") - - # objects - filtered = float_string_frame.filter(like="foo") - assert "foo" in filtered - - # unicode columns, won't ascii-encode - df = float_frame.rename(columns={"B": "\u2202"}) - filtered = df.filter(like="C") - assert "C" in filtered - - def test_filter_regex_search(self, float_frame): - fcopy = float_frame.copy() - fcopy["AA"] = 1 - - # regex - filtered = fcopy.filter(regex="[A]+") - assert len(filtered.columns) == 2 - assert "AA" in filtered - - # doesn't have to be at beginning - df = DataFrame( - {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]} - ) - - result = df.filter(regex="BB") - exp = df[[x for x in df.columns if "BB" in x]] - tm.assert_frame_equal(result, exp) - - @pytest.mark.parametrize( - "name,expected", - [ - ("a", DataFrame({"a": [1, 2]})), - ("a", DataFrame({"a": [1, 2]})), - ("あ", DataFrame({"あ": [3, 4]})), - ], - ) - def test_filter_unicode(self, name, expected): - # GH13101 - df = DataFrame({"a": [1, 2], "あ": [3, 4]}) - - tm.assert_frame_equal(df.filter(like=name), expected) - tm.assert_frame_equal(df.filter(regex=name), expected) - - @pytest.mark.parametrize("name", ["a", "a"]) - def test_filter_bytestring(self, name): - # GH13101 - df = DataFrame({b"a": [1, 2], b"b": [3, 4]}) - expected = DataFrame({b"a": [1, 2]}) - - tm.assert_frame_equal(df.filter(like=name), expected) - tm.assert_frame_equal(df.filter(regex=name), expected) - - def test_filter_corner(self): - empty = DataFrame() - - result = empty.filter([]) - tm.assert_frame_equal(result, empty) - - result = empty.filter(like="foo") - tm.assert_frame_equal(result, 
empty) - - def test_filter_regex_non_string(self): - # GH#5798 trying to filter on non-string columns should drop, - # not raise - df = pd.DataFrame(np.random.random((3, 2)), columns=["STRING", 123]) - result = df.filter(regex="STRING") - expected = df[["STRING"]] - tm.assert_frame_equal(result, expected) - - def test_take(self, float_frame): - # homogeneous - order = [3, 1, 2, 0] - for df in [float_frame]: - - result = df.take(order, axis=0) - expected = df.reindex(df.index.take(order)) - tm.assert_frame_equal(result, expected) - - # axis = 1 - result = df.take(order, axis=1) - expected = df.loc[:, ["D", "B", "C", "A"]] - tm.assert_frame_equal(result, expected, check_names=False) - - # negative indices - order = [2, 1, -1] - for df in [float_frame]: - - result = df.take(order, axis=0) - expected = df.reindex(df.index.take(order)) - tm.assert_frame_equal(result, expected) - - result = df.take(order, axis=0) - tm.assert_frame_equal(result, expected) - - # axis = 1 - result = df.take(order, axis=1) - expected = df.loc[:, ["C", "B", "D"]] - tm.assert_frame_equal(result, expected, check_names=False) - - # illegal indices - msg = "indices are out-of-bounds" - with pytest.raises(IndexError, match=msg): - df.take([3, 1, 2, 30], axis=0) - with pytest.raises(IndexError, match=msg): - df.take([3, 1, 2, -31], axis=0) - with pytest.raises(IndexError, match=msg): - df.take([3, 1, 2, 5], axis=1) - with pytest.raises(IndexError, match=msg): - df.take([3, 1, 2, -5], axis=1) - - def test_take_mixed_type(self, float_string_frame): - - # mixed-dtype - order = [4, 1, 2, 0, 3] - for df in [float_string_frame]: - - result = df.take(order, axis=0) - expected = df.reindex(df.index.take(order)) - tm.assert_frame_equal(result, expected) - - # axis = 1 - result = df.take(order, axis=1) - expected = df.loc[:, ["foo", "B", "C", "A", "D"]] - tm.assert_frame_equal(result, expected) - - # negative indices - order = [4, 1, -2] - for df in [float_string_frame]: - - result = df.take(order, axis=0) - expected = df.reindex(df.index.take(order)) - tm.assert_frame_equal(result, expected) - - # axis = 1 - result = df.take(order, axis=1) - expected = df.loc[:, ["foo", "B", "D"]] - tm.assert_frame_equal(result, expected) - - def test_take_mixed_numeric(self, mixed_float_frame, mixed_int_frame): - # by dtype - order = [1, 2, 0, 3] - for df in [mixed_float_frame, mixed_int_frame]: - - result = df.take(order, axis=0) - expected = df.reindex(df.index.take(order)) - tm.assert_frame_equal(result, expected) - - # axis = 1 - result = df.take(order, axis=1) - expected = df.loc[:, ["B", "C", "A", "D"]] - tm.assert_frame_equal(result, expected) - def test_reindex_boolean(self): frame = DataFrame( np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2] @@ -1113,42 +488,23 @@ def test_reindex_multi_categorical_time(self): expected = pd.DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) tm.assert_frame_equal(result, expected) - data = [[1, 2, 3], [1, 2, 3]] - @pytest.mark.parametrize( - "actual", - [ - DataFrame(data=data, index=["a", "a"]), - DataFrame(data=data, index=["a", "b"]), - DataFrame(data=data, index=["a", "b"]).set_index([0, 1]), - DataFrame(data=data, index=["a", "a"]).set_index([0, 1]), - ], + "operation", ["__iadd__", "__isub__", "__imul__", "__ipow__"] ) - def test_raise_on_drop_duplicate_index(self, actual): - - # issue 19186 - level = 0 if isinstance(actual.index, MultiIndex) else None - msg = re.escape("\"['c'] not found in axis\"") - with pytest.raises(KeyError, match=msg): - actual.drop("c", 
level=level, axis=0) - with pytest.raises(KeyError, match=msg): - actual.T.drop("c", level=level, axis=1) - expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore") - tm.assert_frame_equal(expected_no_err, actual) - expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore") - tm.assert_frame_equal(expected_no_err.T, actual) - - @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]]) - @pytest.mark.parametrize("drop_labels", [[], [1], [2]]) - def test_drop_empty_list(self, index, drop_labels): - # GH 21494 - expected_index = [i for i in index if i not in drop_labels] - frame = pd.DataFrame(index=index).drop(drop_labels) - tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) - - @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]]) - @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]]) - def test_drop_non_empty_list(self, index, drop_labels): - # GH 21494 - with pytest.raises(KeyError, match="not found in axis"): - pd.DataFrame(index=index).drop(drop_labels) + @pytest.mark.parametrize("inplace", [False, True]) + def test_inplace_drop_and_operation(self, operation, inplace): + # GH 30484 + df = pd.DataFrame({"x": range(5)}) + expected = df.copy() + df["y"] = range(5) + y = df["y"] + + with tm.assert_produces_warning(None): + if inplace: + df.drop("y", axis=1, inplace=inplace) + else: + df = df.drop("y", axis=1, inplace=inplace) + + # Perform operation and check result + getattr(y, operation)(1) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index a5f5e6f36cd58..e2910a2eb6100 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -43,18 +43,18 @@ def test_setitem_invalidates_datetime_index_freq(self): assert dti[1] == ts def test_cast_internals(self, float_frame): - casted = DataFrame(float_frame._data, dtype=int) + casted = DataFrame(float_frame._mgr, dtype=int) expected = DataFrame(float_frame._series, dtype=int) tm.assert_frame_equal(casted, expected) - casted = DataFrame(float_frame._data, dtype=np.int32) + casted = DataFrame(float_frame._mgr, dtype=np.int32) expected = DataFrame(float_frame._series, dtype=np.int32) tm.assert_frame_equal(casted, expected) def test_consolidate(self, float_frame): float_frame["E"] = 7.0 consolidated = float_frame._consolidate() - assert len(consolidated._data.blocks) == 1 + assert len(consolidated._mgr.blocks) == 1 # Ensure copy, do I want this? 
recons = consolidated._consolidate() @@ -62,10 +62,10 @@ def test_consolidate(self, float_frame): tm.assert_frame_equal(recons, consolidated) float_frame["F"] = 8.0 - assert len(float_frame._data.blocks) == 3 + assert len(float_frame._mgr.blocks) == 3 float_frame._consolidate(inplace=True) - assert len(float_frame._data.blocks) == 1 + assert len(float_frame._mgr.blocks) == 1 def test_consolidate_inplace(self, float_frame): frame = float_frame.copy() # noqa @@ -76,9 +76,9 @@ def test_consolidate_inplace(self, float_frame): def test_values_consolidate(self, float_frame): float_frame["E"] = 7.0 - assert not float_frame._data.is_consolidated() + assert not float_frame._mgr.is_consolidated() _ = float_frame.values # noqa - assert float_frame._data.is_consolidated() + assert float_frame._mgr.is_consolidated() def test_modify_values(self, float_frame): float_frame.values[5] = 5 @@ -300,7 +300,7 @@ def test_equals_different_blocks(self): df1 = df0.reset_index()[["A", "B", "C"]] # this assert verifies that the above operations have # induced a block rearrangement - assert df0._data.blocks[0].dtype != df1._data.blocks[0].dtype + assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype # do the real tests tm.assert_frame_equal(df0, df1) @@ -342,7 +342,7 @@ def test_copy(self, float_frame, float_string_frame): # copy objects copy = float_string_frame.copy() - assert copy._data is not float_string_frame._data + assert copy._mgr is not float_string_frame._mgr def test_pickle(self, float_string_frame, timezone_frame): empty_frame = DataFrame() @@ -351,7 +351,7 @@ def test_pickle(self, float_string_frame, timezone_frame): tm.assert_frame_equal(float_string_frame, unpickled) # buglet - float_string_frame._data.ndim + float_string_frame._mgr.ndim # empty unpickled = tm.round_trip_pickle(empty_frame) @@ -478,7 +478,7 @@ def test_convert_objects(self, float_string_frame): length = len(float_string_frame) float_string_frame["J"] = "1." 
float_string_frame["K"] = "1" - float_string_frame.loc[0:5, ["J", "K"]] = "garbled" + float_string_frame.loc[float_string_frame.index[0:5], ["J", "K"]] = "garbled" converted = float_string_frame._convert(datetime=True, numeric=True) assert converted["H"].dtype == "float64" assert converted["I"].dtype == "int64" @@ -604,7 +604,7 @@ def test_constructor_no_pandas_array(self): result = pd.DataFrame({"A": arr}) expected = pd.DataFrame({"A": [1, 2, 3]}) tm.assert_frame_equal(result, expected) - assert isinstance(result._data.blocks[0], IntBlock) + assert isinstance(result._mgr.blocks[0], IntBlock) def test_add_column_with_pandas_array(self): # GH 26390 @@ -617,6 +617,6 @@ def test_add_column_with_pandas_array(self): "c": pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)), } ) - assert type(df["c"]._data.blocks[0]) == ObjectBlock - assert type(df2["c"]._data.blocks[0]) == ObjectBlock + assert type(df["c"]._mgr.blocks[0]) == ObjectBlock + assert type(df2["c"]._mgr.blocks[0]) == ObjectBlock tm.assert_frame_equal(df, df2) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 321eb5fe94daf..7eba2b873c4f4 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -8,7 +8,7 @@ import pandas._testing as tm -class TestDataFrameConcatCommon: +class TestDataFrameConcat: def test_concat_multiple_frames_dtypes(self): # GH 2759 @@ -107,77 +107,6 @@ def test_concat_tuple_keys(self): ) tm.assert_frame_equal(results, expected) - def test_join_str_datetime(self): - str_dates = ["20120209", "20120222"] - dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] - - A = DataFrame(str_dates, index=range(2), columns=["aa"]) - C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates) - - tst = A.join(C, on="aa") - - assert len(tst.columns) == 3 - - def test_join_multiindex_leftright(self): - # GH 10741 - df1 = pd.DataFrame( - [ - ["a", "x", 0.471780], - ["a", "y", 0.774908], - ["a", "z", 0.563634], - ["b", "x", -0.353756], - ["b", "y", 0.368062], - ["b", "z", -1.721840], - ["c", "x", 1], - ["c", "y", 2], - ["c", "z", 3], - ], - columns=["first", "second", "value1"], - ).set_index(["first", "second"]) - - df2 = pd.DataFrame( - [["a", 10], ["b", 20]], columns=["first", "value2"] - ).set_index(["first"]) - - exp = pd.DataFrame( - [ - [0.471780, 10], - [0.774908, 10], - [0.563634, 10], - [-0.353756, 20], - [0.368062, 20], - [-1.721840, 20], - [1.000000, np.nan], - [2.000000, np.nan], - [3.000000, np.nan], - ], - index=df1.index, - columns=["value1", "value2"], - ) - - # these must be the same results (but columns are flipped) - tm.assert_frame_equal(df1.join(df2, how="left"), exp) - tm.assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]]) - - exp_idx = pd.MultiIndex.from_product( - [["a", "b"], ["x", "y", "z"]], names=["first", "second"] - ) - exp = pd.DataFrame( - [ - [0.471780, 10], - [0.774908, 10], - [0.563634, 10], - [-0.353756, 20], - [0.368062, 20], - [-1.721840, 20], - ], - index=exp_idx, - columns=["value1", "value2"], - ) - - tm.assert_frame_equal(df1.join(df2, how="right"), exp) - tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]]) - def test_concat_named_keys(self): # GH 14252 df = pd.DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]}) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 14162bc433317..baac87755c6d2 100644 --- a/pandas/tests/frame/test_constructors.py +++ 
b/pandas/tests/frame/test_constructors.py @@ -2,13 +2,16 @@ from datetime import date, datetime, timedelta import functools import itertools +import re import numpy as np import numpy.ma as ma import numpy.ma.mrecords as mrecords import pytest +import pytz -from pandas.compat import is_platform_little_endian +from pandas.compat import PY37, is_platform_little_endian +from pandas.compat.numpy import _is_numpy_dev from pandas.core.dtypes.common import is_integer_dtype @@ -45,15 +48,15 @@ class TestDataFrameConstructors: def test_series_with_name_not_matching_column(self): # GH#9232 - x = pd.Series(range(5), name=1) - y = pd.Series(range(5), name=0) + x = Series(range(5), name=1) + y = Series(range(5), name=0) - result = pd.DataFrame(x, columns=[0]) - expected = pd.DataFrame([], columns=[0]) + result = DataFrame(x, columns=[0]) + expected = DataFrame([], columns=[0]) tm.assert_frame_equal(result, expected) - result = pd.DataFrame(y, columns=[1]) - expected = pd.DataFrame([], columns=[1]) + result = DataFrame(y, columns=[1]) + expected = DataFrame([], columns=[1]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -124,7 +127,7 @@ def test_constructor_cast_failure(self): def test_constructor_dtype_copy(self): orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]}) - new_df = pd.DataFrame(orig_df, dtype=float, copy=True) + new_df = DataFrame(orig_df, dtype=float, copy=True) new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 @@ -144,6 +147,7 @@ def test_constructor_dtype_list_data(self): assert df.loc[1, 0] is None assert df.loc[0, 1] == "2" + @pytest.mark.xfail(_is_numpy_dev, reason="Interprets list of frame as 3D") def test_constructor_list_frames(self): # see gh-3243 result = DataFrame([DataFrame()]) @@ -217,10 +221,10 @@ def test_constructor_rec(self, float_frame): index = float_frame.index df = DataFrame(rec) - tm.assert_index_equal(df.columns, pd.Index(rec.dtype.names)) + tm.assert_index_equal(df.columns, Index(rec.dtype.names)) df2 = DataFrame(rec, index=index) - tm.assert_index_equal(df2.columns, pd.Index(rec.dtype.names)) + tm.assert_index_equal(df2.columns, Index(rec.dtype.names)) tm.assert_index_equal(df2.index, index) rng = np.arange(len(rec))[::-1] @@ -295,7 +299,7 @@ def test_constructor_dict(self): tm.assert_series_equal(frame["col1"], datetime_series.rename("col1")) - exp = pd.Series( + exp = Series( np.concatenate([[np.nan] * 5, datetime_series_short.values]), index=datetime_series.index, name="col2", @@ -322,7 +326,7 @@ def test_constructor_dict(self): # Length-one dict micro-optimization frame = DataFrame({"A": {"1": 1, "2": 2}}) - tm.assert_index_equal(frame.index, pd.Index(["1", "2"])) + tm.assert_index_equal(frame.index, Index(["1", "2"])) # empty dict plus index idx = Index([0, 1, 2]) @@ -415,8 +419,8 @@ def test_constructor_dict_order_insertion(self): def test_constructor_dict_nan_key_and_columns(self): # GH 16894 - result = pd.DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2]) - expected = pd.DataFrame([[1, 2], [2, 3]], columns=[np.nan, 2]) + result = DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2]) + expected = DataFrame([[1, 2], [2, 3]], columns=[np.nan, 2]) tm.assert_frame_equal(result, expected) def test_constructor_multi_index(self): @@ -425,29 +429,29 @@ def test_constructor_multi_index(self): tuples = [(2, 3), (3, 3), (3, 3)] mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) - assert pd.isna(df).values.ravel().all() + assert isna(df).values.ravel().all() tuples = [(3, 3), (2, 3), (3, 3)] 
mi = MultiIndex.from_tuples(tuples) df = DataFrame(index=mi, columns=mi) - assert pd.isna(df).values.ravel().all() + assert isna(df).values.ravel().all() def test_constructor_2d_index(self): # GH 25416 # handling of 2d index in construction - df = pd.DataFrame([[1]], columns=[[1]], index=[1, 2]) - expected = pd.DataFrame( + df = DataFrame([[1]], columns=[[1]], index=[1, 2]) + expected = DataFrame( [1, 1], index=pd.Int64Index([1, 2], dtype="int64"), - columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + columns=MultiIndex(levels=[[1]], codes=[[0]]), ) tm.assert_frame_equal(df, expected) - df = pd.DataFrame([[1]], columns=[[1]], index=[[1, 2]]) - expected = pd.DataFrame( + df = DataFrame([[1]], columns=[[1]], index=[[1, 2]]) + expected = DataFrame( [1, 1], - index=pd.MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), - columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + index=MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), + columns=MultiIndex(levels=[[1]], codes=[[0]]), ) tm.assert_frame_equal(df, expected) @@ -468,7 +472,7 @@ def test_constructor_error_msgs(self): DataFrame( np.arange(12).reshape((4, 3)), columns=["foo", "bar", "baz"], - index=pd.date_range("2000-01-01", periods=3), + index=date_range("2000-01-01", periods=3), ) arr = np.array([[4, 5, 6]]) @@ -503,6 +507,7 @@ def test_constructor_error_msgs(self): with pytest.raises(ValueError, match=msg): DataFrame({"a": False, "b": True}) + @pytest.mark.xfail(_is_numpy_dev, reason="Interprets embedded frame as 3D") def test_constructor_with_embedded_frames(self): # embedded data frames @@ -709,14 +714,12 @@ def test_constructor_period(self): # PeriodIndex a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M") b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D") - df = pd.DataFrame({"a": a, "b": b}) + df = DataFrame({"a": a, "b": b}) assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype # list of periods - df = pd.DataFrame( - {"a": a.astype(object).tolist(), "b": b.astype(object).tolist()} - ) + df = DataFrame({"a": a.astype(object).tolist(), "b": b.astype(object).tolist()}) assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype @@ -878,8 +881,8 @@ def test_constructor_maskedarray_nonfloat(self): def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) - expected = pd.DataFrame( + result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = DataFrame( {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, columns=["A", "B"], index=[1, 2], @@ -888,8 +891,8 @@ def test_constructor_maskedarray_hardened(self): tm.assert_frame_equal(result, expected) # Check case where mask is hard but no data are masked mat_hard = ma.ones((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) - expected = pd.DataFrame( + result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = DataFrame( {"A": [1.0, 1.0], "B": [1.0, 1.0]}, columns=["A", "B"], index=[1, 2], @@ -903,8 +906,8 @@ def test_constructor_maskedrecarray_dtype(self): np.ma.zeros(5, dtype=[("date", " error if len(indexer) == 0: - msg = ( - "cannot do label indexing on RangeIndex " - r"with these indexers \[nan\] of type float" - ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(KeyError, match="^nan$"): df.loc[:, np.nan] # single nan should result in Series elif len(indexer) == 1: @@ -1984,7 +2048,7 @@ def 
test_from_records_to_records(self): # TODO(wesm): unused frame = DataFrame.from_records(arr) # noqa - index = pd.Index(np.arange(len(arr))[::-1]) + index = Index(np.arange(len(arr))[::-1]) indexed_frame = DataFrame.from_records(arr, index=index) tm.assert_index_equal(indexed_frame.index, index) @@ -2283,7 +2347,7 @@ def test_from_records_sequencelike(self): # empty case result = DataFrame.from_records([], columns=["foo", "bar", "baz"]) assert len(result) == 0 - tm.assert_index_equal(result.columns, pd.Index(["foo", "bar", "baz"])) + tm.assert_index_equal(result.columns, Index(["foo", "bar", "baz"])) result = DataFrame.from_records([]) assert len(result) == 0 @@ -2386,6 +2450,12 @@ def test_from_records_series_list_dict(self): result = DataFrame.from_records(data) tm.assert_frame_equal(result, expected) + def test_frame_from_records_utc(self): + rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)} + + # it works + DataFrame.from_records([rec], index="begin_time") + def test_to_frame_with_falsey_names(self): # GH 16114 result = Series(name=0, dtype=object).to_frame().dtypes @@ -2436,20 +2506,20 @@ def test_datetime_date_tuple_columns_from_dict(self): v = date.today() tup = v, v result = DataFrame({tup: Series(range(3), index=range(3))}, columns=[tup]) - expected = DataFrame([0, 1, 2], columns=pd.Index(pd.Series([tup]))) + expected = DataFrame([0, 1, 2], columns=Index(Series([tup]))) tm.assert_frame_equal(result, expected) def test_construct_with_two_categoricalindex_series(self): # GH 14600 - s1 = pd.Series( + s1 = Series( [39, 6, 4], index=pd.CategoricalIndex(["female", "male", "unknown"]) ) - s2 = pd.Series( + s2 = Series( [2, 152, 2, 242, 150], index=pd.CategoricalIndex(["f", "female", "m", "male", "unknown"]), ) - result = pd.DataFrame([s1, s2]) - expected = pd.DataFrame( + result = DataFrame([s1, s2]) + expected = DataFrame( np.array( [[np.nan, 39.0, np.nan, 6.0, 4.0], [2.0, 152.0, 2.0, 242.0, 150.0]] ), @@ -2457,6 +2527,18 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + def test_from_M8_structured(self): + dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] + arr = np.array(dates, dtype=[("Date", "M8[us]"), ("Forecasting", "M8[us]")]) + df = DataFrame(arr) + + assert df["Date"][0] == dates[0][0] + assert df["Forecasting"][0] == dates[0][1] + + s = Series(arr["Date"]) + assert isinstance(s[0], Timestamp) + assert s[0] == dates[0][0] + class TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): @@ -2536,19 +2618,19 @@ def test_nested_dict_construction(self): "Nevada": {2001: 2.4, 2002: 2.9}, "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6}, } - result = pd.DataFrame(pop, index=[2001, 2002, 2003], columns=columns) - expected = pd.DataFrame( + result = DataFrame(pop, index=[2001, 2002, 2003], columns=columns) + expected = DataFrame( [(2.4, 1.7), (2.9, 3.6), (np.nan, np.nan)], columns=columns, - index=pd.Index([2001, 2002, 2003]), + index=Index([2001, 2002, 2003]), ) tm.assert_frame_equal(result, expected) def test_from_tzaware_object_array(self): # GH#26825 2D object array of tzaware timestamps should not raise - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti = date_range("2016-04-05 04:30", periods=3, tz="UTC") data = dti._data.astype(object).reshape(1, -1) - df = pd.DataFrame(data) + df = DataFrame(data) assert df.shape == (1, 3) assert (df.dtypes == dti.dtype).all() assert (df == dti).all().all() @@ -2587,7 +2669,13 @@ def test_from_tzaware_mixed_object_array(self): 
def test_from_2d_ndarray_with_dtype(self): # GH#12513 array_dim2 = np.arange(10).reshape((5, 2)) - df = pd.DataFrame(array_dim2, dtype="datetime64[ns, UTC]") + df = DataFrame(array_dim2, dtype="datetime64[ns, UTC]") - expected = pd.DataFrame(array_dim2).astype("datetime64[ns, UTC]") + expected = DataFrame(array_dim2).astype("datetime64[ns, UTC]") tm.assert_frame_equal(df, expected) + + def test_construction_from_set_raises(self): + # https://github.com/pandas-dev/pandas/issues/32582 + msg = "Set type is unordered" + with pytest.raises(TypeError, match=msg): + pd.DataFrame({"a": {1, 2, 3}}) diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py index 486cbfb2761e0..1b7e70dd28c63 100644 --- a/pandas/tests/frame/test_cumulative.py +++ b/pandas/tests/frame/test_cumulative.py @@ -23,9 +23,9 @@ def test_cumsum_corner(self): result = dm.cumsum() # noqa def test_cumsum(self, datetime_frame): - datetime_frame.loc[5:10, 0] = np.nan - datetime_frame.loc[10:15, 1] = np.nan - datetime_frame.loc[15:, 2] = np.nan + datetime_frame.iloc[5:10, 0] = np.nan + datetime_frame.iloc[10:15, 1] = np.nan + datetime_frame.iloc[15:, 2] = np.nan # axis = 0 cumsum = datetime_frame.cumsum() @@ -46,9 +46,9 @@ def test_cumsum(self, datetime_frame): assert np.shape(cumsum_xs) == np.shape(datetime_frame) def test_cumprod(self, datetime_frame): - datetime_frame.loc[5:10, 0] = np.nan - datetime_frame.loc[10:15, 1] = np.nan - datetime_frame.loc[15:, 2] = np.nan + datetime_frame.iloc[5:10, 0] = np.nan + datetime_frame.iloc[10:15, 1] = np.nan + datetime_frame.iloc[15:, 2] = np.nan # axis = 0 cumprod = datetime_frame.cumprod() @@ -80,9 +80,9 @@ def test_cumprod(self, datetime_frame): strict=False, ) def test_cummin(self, datetime_frame): - datetime_frame.loc[5:10, 0] = np.nan - datetime_frame.loc[10:15, 1] = np.nan - datetime_frame.loc[15:, 2] = np.nan + datetime_frame.iloc[5:10, 0] = np.nan + datetime_frame.iloc[10:15, 1] = np.nan + datetime_frame.iloc[15:, 2] = np.nan # axis = 0 cummin = datetime_frame.cummin() @@ -108,9 +108,9 @@ def test_cummin(self, datetime_frame): strict=False, ) def test_cummax(self, datetime_frame): - datetime_frame.loc[5:10, 0] = np.nan - datetime_frame.loc[10:15, 1] = np.nan - datetime_frame.loc[15:, 2] = np.nan + datetime_frame.iloc[5:10, 0] = np.nan + datetime_frame.iloc[10:15, 1] = np.nan + datetime_frame.iloc[15:, 2] = np.nan # axis = 0 cummax = datetime_frame.cummax() diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 713d8f3ceeedb..27ebee4aaaccf 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1,5 +1,6 @@ from collections import OrderedDict from datetime import timedelta +import re import numpy as np import pytest @@ -247,12 +248,7 @@ def test_astype_str(self): { "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))), "b": list(map(str, map(Timestamp, b._values))), - "c": list( - map( - str, - map(lambda x: Timedelta(x)._repr_base(format="all"), c._values), - ) - ), + "c": list(map(lambda x: Timedelta(x)._repr_base(), c._values)), "d": list(map(str, d._values)), "e": list(map(str, e._values)), } @@ -452,22 +448,6 @@ def test_astype_extension_dtypes_duplicate_col(self, dtype): expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("kwargs", [dict(), dict(other=None)]) - def test_df_where_with_category(self, kwargs): - # GH 16979 - df = DataFrame(np.arange(2 * 3).reshape(2, 
3), columns=list("ABC")) - mask = np.array([[True, False, True], [False, True, True]]) - - # change type to category - df.A = df.A.astype("category") - df.B = df.B.astype("category") - df.C = df.C.astype("category") - - result = df.A.where(mask[:, 0], **kwargs) - expected = Series(pd.Categorical([0, np.nan], categories=[0, 3]), name="A") - - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"] ) @@ -478,34 +458,9 @@ def test_astype_column_metadata(self, dtype): df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) - def test_df_where_change_dtype(self): - # GH 16979 - df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) - mask = np.array([[True, False, False], [False, False, True]]) - - result = df.where(mask) - expected = DataFrame( - [[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC") - ) - - tm.assert_frame_equal(result, expected) - - # change type to category - df.A = df.A.astype("category") - df.B = df.B.astype("category") - df.C = df.C.astype("category") - - result = df.where(mask) - A = pd.Categorical([0, np.nan], categories=[0, 3]) - B = pd.Categorical([np.nan, np.nan], categories=[1, 4]) - C = pd.Categorical([np.nan, 5], categories=[2, 5]) - expected = DataFrame({"A": A, "B": B, "C": C}) - - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) - def test_astype_from_datetimelike_to_objectt(self, dtype, unit): + def test_astype_from_datetimelike_to_object(self, dtype, unit): # tests astype to object dtype # gh-19223 / gh-12425 dtype = f"{dtype}[{unit}]" @@ -636,7 +591,11 @@ def test_arg_for_errors_in_astype(self): df = DataFrame([1, 2, 3]) - with pytest.raises(ValueError): + msg = ( + "Expected value of kwarg 'errors' to be one of " + "['raise', 'ignore']. 
Supplied value is 'True'" + ) + with pytest.raises(ValueError, match=re.escape(msg)): df.astype(np.float64, errors=True) df.astype(np.int8, errors="ignore") diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 8c388a887158f..4d6e675c6765f 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -1,6 +1,9 @@ +from datetime import datetime + import numpy as np import pytest +import pandas as pd from pandas import DataFrame, Index, period_range import pandas._testing as tm @@ -216,3 +219,76 @@ def test_suppress_future_warning_with_sort_kw(sort_kw): with tm.assert_produces_warning(None, check_stacklevel=False): result = a.join([b, c], how="outer", sort=sort_kw) tm.assert_frame_equal(result, expected) + + +class TestDataFrameJoin: + def test_join_str_datetime(self): + str_dates = ["20120209", "20120222"] + dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] + + A = DataFrame(str_dates, index=range(2), columns=["aa"]) + C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates) + + tst = A.join(C, on="aa") + + assert len(tst.columns) == 3 + + def test_join_multiindex_leftright(self): + # GH 10741 + df1 = pd.DataFrame( + [ + ["a", "x", 0.471780], + ["a", "y", 0.774908], + ["a", "z", 0.563634], + ["b", "x", -0.353756], + ["b", "y", 0.368062], + ["b", "z", -1.721840], + ["c", "x", 1], + ["c", "y", 2], + ["c", "z", 3], + ], + columns=["first", "second", "value1"], + ).set_index(["first", "second"]) + + df2 = pd.DataFrame( + [["a", 10], ["b", 20]], columns=["first", "value2"] + ).set_index(["first"]) + + exp = pd.DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + [1.000000, np.nan], + [2.000000, np.nan], + [3.000000, np.nan], + ], + index=df1.index, + columns=["value1", "value2"], + ) + + # these must be the same results (but columns are flipped) + tm.assert_frame_equal(df1.join(df2, how="left"), exp) + tm.assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]]) + + exp_idx = pd.MultiIndex.from_product( + [["a", "b"], ["x", "y", "z"]], names=["first", "second"] + ) + exp = pd.DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + ], + index=exp_idx, + columns=["value1", "value2"], + ) + + tm.assert_frame_equal(df1.join(df2, how="right"), exp) + tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]]) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 196df8ba00476..7cb7115276f71 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -114,7 +114,7 @@ def test_dropna(self): tm.assert_frame_equal(dropped, expected) # bad input - msg = "No axis named 3 for object type " + msg = "No axis named 3 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.dropna(axis=3) @@ -372,8 +372,12 @@ def test_fillna_categorical_nan(self): cat = Categorical([np.nan, 2, np.nan]) val = Categorical([np.nan, np.nan, np.nan]) df = DataFrame({"cats": cat, "vals": val}) - with tm.assert_produces_warning(RuntimeWarning): - res = df.fillna(df.median()) + + # GH#32950 df.median() is poorly behaved because there is no + # Categorical.median + median = Series({"cats": 2.0, "vals": np.nan}) + + res = df.fillna(median) v_exp = [np.nan, np.nan, np.nan] df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") tm.assert_frame_equal(res, df_exp) @@ -694,12 +698,3 @@ def 
test_fill_corner(self, float_frame, float_string_frame): # TODO(wesm): unused? result = empty_float.fillna(value=0) # noqa - - def test_fill_value_when_combine_const(self): - # GH12723 - dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") - df = DataFrame({"foo": dat}, index=range(6)) - - exp = df.fillna(0).add(2) - res = df.add(2, fill_value=0) - tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py deleted file mode 100644 index 33f71602f4713..0000000000000 --- a/pandas/tests/frame/test_mutate_columns.py +++ /dev/null @@ -1,186 +0,0 @@ -import re - -import numpy as np -import pytest - -from pandas import DataFrame, Index, MultiIndex, Series -import pandas._testing as tm - -# Column add, remove, delete. - - -class TestDataFrameMutateColumns: - def test_insert_error_msmgs(self): - - # GH 7432 - df = DataFrame( - {"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": ["d", "e", "f"]} - ).set_index("foo") - s = DataFrame( - {"foo": ["a", "b", "c", "a"], "fiz": ["g", "h", "i", "j"]} - ).set_index("foo") - msg = "cannot reindex from a duplicate axis" - with pytest.raises(ValueError, match=msg): - df["newcol"] = s - - # GH 4107, more descriptive error message - df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"]) - - msg = "incompatible index of inserted column with frame index" - with pytest.raises(TypeError, match=msg): - df["gr"] = df.groupby(["b", "c"]).count() - - def test_insert_benchmark(self): - # from the vb_suite/frame_methods/frame_insert_columns - N = 10 - K = 5 - df = DataFrame(index=range(N)) - new_col = np.random.randn(N) - for i in range(K): - df[i] = new_col - expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N)) - tm.assert_frame_equal(df, expected) - - def test_insert(self): - df = DataFrame( - np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"] - ) - - df.insert(0, "foo", df["a"]) - tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"])) - tm.assert_series_equal(df["a"], df["foo"], check_names=False) - - df.insert(2, "bar", df["c"]) - tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"])) - tm.assert_almost_equal(df["c"], df["bar"], check_names=False) - - # diff dtype - - # new item - df["x"] = df["a"].astype("float32") - result = df.dtypes - expected = Series( - [np.dtype("float64")] * 5 + [np.dtype("float32")], - index=["foo", "c", "bar", "b", "a", "x"], - ) - tm.assert_series_equal(result, expected) - - # replacing current (in different block) - df["a"] = df["a"].astype("float32") - result = df.dtypes - expected = Series( - [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2, - index=["foo", "c", "bar", "b", "a", "x"], - ) - tm.assert_series_equal(result, expected) - - df["y"] = df["a"].astype("int32") - result = df.dtypes - expected = Series( - [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")], - index=["foo", "c", "bar", "b", "a", "x", "y"], - ) - tm.assert_series_equal(result, expected) - - with pytest.raises(ValueError, match="already exists"): - df.insert(1, "a", df["b"]) - msg = "cannot insert c, already exists" - with pytest.raises(ValueError, match=msg): - df.insert(1, "c", df["b"]) - - df.columns.name = "some_name" - # preserve columns name field - df.insert(0, "baz", df["c"]) - assert df.columns.name == "some_name" - - # GH 13522 - df = DataFrame(index=["A", "B", "C"]) - df["X"] = df.index - df["X"] = ["x", "y", "z"] - exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", 
"B", "C"]) - tm.assert_frame_equal(df, exp) - - def test_delitem(self, float_frame): - del float_frame["A"] - assert "A" not in float_frame - - def test_delitem_multiindex(self): - midx = MultiIndex.from_product([["A", "B"], [1, 2]]) - df = DataFrame(np.random.randn(4, 4), columns=midx) - assert len(df.columns) == 4 - assert ("A",) in df.columns - assert "A" in df.columns - - result = df["A"] - assert isinstance(result, DataFrame) - del df["A"] - - assert len(df.columns) == 2 - - # A still in the levels, BUT get a KeyError if trying - # to delete - assert ("A",) not in df.columns - with pytest.raises(KeyError, match=re.escape("('A',)")): - del df[("A",)] - - # behavior of dropped/deleted MultiIndex levels changed from - # GH 2770 to GH 19027: MultiIndex no longer '.__contains__' - # levels which are dropped/deleted - assert "A" not in df.columns - with pytest.raises(KeyError, match=re.escape("('A',)")): - del df["A"] - - def test_pop(self, float_frame): - float_frame.columns.name = "baz" - - float_frame.pop("A") - assert "A" not in float_frame - - float_frame["foo"] = "bar" - float_frame.pop("foo") - assert "foo" not in float_frame - assert float_frame.columns.name == "baz" - - # gh-10912: inplace ops cause caching issue - a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) - b = a.pop("B") - b += 1 - - # original frame - expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) - tm.assert_frame_equal(a, expected) - - # result - expected = Series([2, 5], index=["X", "Y"], name="B") + 1 - tm.assert_series_equal(b, expected) - - def test_pop_non_unique_cols(self): - df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]}) - df.columns = ["a", "b", "a"] - - res = df.pop("a") - assert type(res) == DataFrame - assert len(res) == 2 - assert len(df.columns) == 1 - assert "b" in df.columns - assert "a" not in df.columns - assert len(df.index) == 2 - - def test_insert_column_bug_4032(self): - - # GH4032, inserting a column and renaming causing errors - df = DataFrame({"b": [1.1, 2.2]}) - df = df.rename(columns={}) - df.insert(0, "a", [1, 2]) - - result = df.rename(columns={}) - str(result) - expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - df.insert(0, "c", [1.3, 2.3]) - - result = df.rename(columns={}) - str(result) - - expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 32ead406a3e86..a8b76f4d85f49 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -474,8 +474,8 @@ def test_columns_with_dups(self): ) df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) - assert len(df._data._blknos) == len(df.columns) - assert len(df._data._blklocs) == len(df.columns) + assert len(df._mgr.blknos) == len(df.columns) + assert len(df._mgr.blklocs) == len(df.columns) # testing iloc for i in range(len(df.columns)): @@ -513,14 +513,3 @@ def test_set_value_by_index(self): df.iloc[:, 0] = 3 tm.assert_series_equal(df.iloc[:, 1], expected) - - def test_insert_with_columns_dups(self): - # GH 14291 - df = pd.DataFrame() - df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True) - df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) - df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) - exp = pd.DataFrame( - [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], 
columns=["A", "A", "A"] - ) - tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 542d9835bb5d3..fede1ca23a8ce 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -1,14 +1,13 @@ from decimal import Decimal import operator +import re import numpy as np import pytest import pandas as pd -from pandas import DataFrame, MultiIndex, Series +from pandas import DataFrame, Series import pandas._testing as tm -import pandas.core.common as com -from pandas.tests.frame.common import _check_mixed_float class TestDataFrameUnaryOperators: @@ -51,9 +50,13 @@ def test_neg_object(self, df, expected): ], ) def test_neg_raises(self, df): - with pytest.raises(TypeError): + msg = ( + "bad operand type for unary -: 'str'|" + r"Unary negative expects numeric dtype, not datetime64\[ns\]" + ) + with pytest.raises(TypeError, match=msg): (-df) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): (-df["a"]) def test_invert(self, float_frame): @@ -116,9 +119,10 @@ def test_pos_object(self, df): "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})] ) def test_pos_raises(self, df): - with pytest.raises(TypeError): + msg = re.escape("Unary plus expects numeric dtype, not datetime64[ns]") + with pytest.raises(TypeError, match=msg): (+df) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): (+df["a"]) @@ -173,12 +177,14 @@ def test_logical_ops_invalid(self): df1 = DataFrame(1.0, index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) - with pytest.raises(TypeError): + msg = re.escape("unsupported operand type(s) for |: 'float' and 'bool'") + with pytest.raises(TypeError, match=msg): df1 | df2 df1 = DataFrame("foo", index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) - with pytest.raises(TypeError): + msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") + with pytest.raises(TypeError, match=msg): df1 | df2 def test_logical_operators(self): @@ -274,620 +280,3 @@ def test_logical_operators_nans(self, left, right, op, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - - -class TestDataFrameOperators: - @pytest.mark.parametrize( - "op", [operator.add, operator.sub, operator.mul, operator.truediv] - ) - def test_operators_none_as_na(self, op): - df = DataFrame( - {"col1": [2, 5.0, 123, None], "col2": [1, 2, 3, 4]}, dtype=object - ) - - # since filling converts dtypes from object, changed expected to be - # object - filled = df.fillna(np.nan) - result = op(df, 3) - expected = op(filled, 3).astype(object) - expected[com.isna(expected)] = None - tm.assert_frame_equal(result, expected) - - result = op(df, df) - expected = op(filled, filled).astype(object) - expected[com.isna(expected)] = None - tm.assert_frame_equal(result, expected) - - result = op(df, df.fillna(7)) - tm.assert_frame_equal(result, expected) - - result = op(df.fillna(7), df) - tm.assert_frame_equal(result, expected, check_dtype=False) - - @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)]) - # TODO: not sure what's correct here. 
- @pytest.mark.filterwarnings("ignore:elementwise:FutureWarning") - def test_logical_typeerror_with_non_valid(self, op, res, float_frame): - # we are comparing floats vs a string - result = getattr(float_frame, op)("foo") - assert bool(result.all().all()) is res - - def test_binary_ops_align(self): - - # test aligning binary ops - - # GH 6681 - index = MultiIndex.from_product( - [list("abc"), ["one", "two", "three"], [1, 2, 3]], - names=["first", "second", "third"], - ) - - df = DataFrame( - np.arange(27 * 3).reshape(27, 3), - index=index, - columns=["value1", "value2", "value3"], - ).sort_index() - - idx = pd.IndexSlice - for op in ["add", "sub", "mul", "div", "truediv"]: - opa = getattr(operator, op, None) - if opa is None: - continue - - x = Series([1.0, 10.0, 100.0], [1, 2, 3]) - result = getattr(df, op)(x, level="third", axis=0) - - expected = pd.concat( - [opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()] - ).sort_index() - tm.assert_frame_equal(result, expected) - - x = Series([1.0, 10.0], ["two", "three"]) - result = getattr(df, op)(x, level="second", axis=0) - - expected = ( - pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.items()]) - .reindex_like(df) - .sort_index() - ) - tm.assert_frame_equal(result, expected) - - # GH9463 (alignment level of dataframe with series) - - midx = MultiIndex.from_product([["A", "B"], ["a", "b"]]) - df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx) - s = pd.Series({"a": 1, "b": 2}) - - df2 = df.copy() - df2.columns.names = ["lvl0", "lvl1"] - s2 = s.copy() - s2.index.name = "lvl1" - - # different cases of integer/string level names: - res1 = df.mul(s, axis=1, level=1) - res2 = df.mul(s2, axis=1, level=1) - res3 = df2.mul(s, axis=1, level=1) - res4 = df2.mul(s2, axis=1, level=1) - res5 = df2.mul(s, axis=1, level="lvl1") - res6 = df2.mul(s2, axis=1, level="lvl1") - - exp = DataFrame( - np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx - ) - - for res in [res1, res2]: - tm.assert_frame_equal(res, exp) - - exp.columns.names = ["lvl0", "lvl1"] - for res in [res3, res4, res5, res6]: - tm.assert_frame_equal(res, exp) - - def test_dti_tz_convert_to_utc(self): - base = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC") - idx1 = base.tz_convert("Asia/Tokyo")[:2] - idx2 = base.tz_convert("US/Eastern")[1:] - - df1 = DataFrame({"A": [1, 2]}, index=idx1) - df2 = DataFrame({"A": [1, 1]}, index=idx2) - exp = DataFrame({"A": [np.nan, 3, np.nan]}, index=base) - tm.assert_frame_equal(df1 + df2, exp) - - def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): - frame_copy = float_frame.reindex(float_frame.index[::2]) - - del frame_copy["D"] - frame_copy["C"][:5] = np.nan - - added = float_frame + frame_copy - - indexer = added["A"].dropna().index - exp = (float_frame["A"] * 2).copy() - - tm.assert_series_equal(added["A"].dropna(), exp.loc[indexer]) - - exp.loc[~exp.index.isin(indexer)] = np.nan - tm.assert_series_equal(added["A"], exp.loc[added["A"].index]) - - assert np.isnan(added["C"].reindex(frame_copy.index)[:5]).all() - - # assert(False) - - assert np.isnan(added["D"]).all() - - self_added = float_frame + float_frame - tm.assert_index_equal(self_added.index, float_frame.index) - - added_rev = frame_copy + float_frame - assert np.isnan(added["D"]).all() - assert np.isnan(added_rev["D"]).all() - - # corner cases - - # empty - plus_empty = float_frame + DataFrame() - assert np.isnan(plus_empty.values).all() - - empty_plus = DataFrame() + float_frame - assert 
np.isnan(empty_plus.values).all() - - empty_empty = DataFrame() + DataFrame() - assert empty_empty.empty - - # out of order - reverse = float_frame.reindex(columns=float_frame.columns[::-1]) - - tm.assert_frame_equal(reverse + float_frame, float_frame * 2) - - # mix vs float64, upcast - added = float_frame + mixed_float_frame - _check_mixed_float(added, dtype="float64") - added = mixed_float_frame + float_frame - _check_mixed_float(added, dtype="float64") - - # mix vs mix - added = mixed_float_frame + mixed_float_frame - _check_mixed_float(added, dtype=dict(C=None)) - - # with int - added = float_frame + mixed_int_frame - _check_mixed_float(added, dtype="float64") - - def test_combine_series( - self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame - ): - - # Series - series = float_frame.xs(float_frame.index[0]) - - added = float_frame + series - - for key, s in added.items(): - tm.assert_series_equal(s, float_frame[key] + series[key]) - - larger_series = series.to_dict() - larger_series["E"] = 1 - larger_series = Series(larger_series) - larger_added = float_frame + larger_series - - for key, s in float_frame.items(): - tm.assert_series_equal(larger_added[key], s + series[key]) - assert "E" in larger_added - assert np.isnan(larger_added["E"]).all() - - # no upcast needed - added = mixed_float_frame + series - _check_mixed_float(added) - - # vs mix (upcast) as needed - added = mixed_float_frame + series.astype("float32") - _check_mixed_float(added, dtype=dict(C=None)) - added = mixed_float_frame + series.astype("float16") - _check_mixed_float(added, dtype=dict(C=None)) - - # FIXME: don't leave commented-out - # these raise with numexpr.....as we are adding an int64 to an - # uint64....weird vs int - - # added = mixed_int_frame + (100*series).astype('int64') - # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C = - # 'int64', D = 'int64')) - # added = mixed_int_frame + (100*series).astype('int32') - # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C = - # 'int32', D = 'int64')) - - # TimeSeries - ts = datetime_frame["A"] - - # 10890 - # we no longer allow auto timeseries broadcasting - # and require explicit broadcasting - added = datetime_frame.add(ts, axis="index") - - for key, col in datetime_frame.items(): - result = col + ts - tm.assert_series_equal(added[key], result, check_names=False) - assert added[key].name == key - if col.name == ts.name: - assert result.name == "A" - else: - assert result.name is None - - smaller_frame = datetime_frame[:-5] - smaller_added = smaller_frame.add(ts, axis="index") - - tm.assert_index_equal(smaller_added.index, datetime_frame.index) - - smaller_ts = ts[:-5] - smaller_added2 = datetime_frame.add(smaller_ts, axis="index") - tm.assert_frame_equal(smaller_added, smaller_added2) - - # length 0, result is all-nan - result = datetime_frame.add(ts[:0], axis="index") - expected = DataFrame( - np.nan, index=datetime_frame.index, columns=datetime_frame.columns - ) - tm.assert_frame_equal(result, expected) - - # Frame is all-nan - result = datetime_frame[:0].add(ts, axis="index") - expected = DataFrame( - np.nan, index=datetime_frame.index, columns=datetime_frame.columns - ) - tm.assert_frame_equal(result, expected) - - # empty but with non-empty index - frame = datetime_frame[:1].reindex(columns=[]) - result = frame.mul(ts, axis="index") - assert len(result) == len(ts) - - def test_combineFunc(self, float_frame, mixed_float_frame): - result = float_frame * 2 - tm.assert_numpy_array_equal(result.values, 
float_frame.values * 2) - - # vs mix - result = mixed_float_frame * 2 - for c, s in result.items(): - tm.assert_numpy_array_equal(s.values, mixed_float_frame[c].values * 2) - _check_mixed_float(result, dtype=dict(C=None)) - - result = DataFrame() * 2 - assert result.index.equals(DataFrame().index) - assert len(result.columns) == 0 - - def test_comparisons(self, simple_frame, float_frame): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame() - - row = simple_frame.xs("a") - ndim_5 = np.ones(df1.shape + (1, 1, 1)) - - def test_comp(func): - result = func(df1, df2) - tm.assert_numpy_array_equal(result.values, func(df1.values, df2.values)) - - with pytest.raises(ValueError, match="dim must be <= 2"): - func(df1, ndim_5) - - result2 = func(simple_frame, row) - tm.assert_numpy_array_equal( - result2.values, func(simple_frame.values, row.values) - ) - - result3 = func(float_frame, 0) - tm.assert_numpy_array_equal(result3.values, func(float_frame.values, 0)) - - msg = "Can only compare identically-labeled DataFrame" - with pytest.raises(ValueError, match=msg): - func(simple_frame, simple_frame[:2]) - - test_comp(operator.eq) - test_comp(operator.ne) - test_comp(operator.lt) - test_comp(operator.gt) - test_comp(operator.ge) - test_comp(operator.le) - - def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): - # GH 11565 - df = DataFrame( - {x: {"x": "foo", "y": "bar", "z": "baz"} for x in ["a", "b", "c"]} - ) - - f = getattr(operator, compare_operators_no_eq_ne) - with pytest.raises(TypeError): - f(df, 0) - - def test_comparison_protected_from_errstate(self): - missing_df = tm.makeDataFrame() - missing_df.iloc[0]["A"] = np.nan - with np.errstate(invalid="ignore"): - expected = missing_df.values < 0 - with np.errstate(invalid="raise"): - result = (missing_df < 0).values - tm.assert_numpy_array_equal(result, expected) - - def test_boolean_comparison(self): - - # GH 4576 - # boolean comparisons with a tuple/list give unexpected results - df = DataFrame(np.arange(6).reshape((3, 2))) - b = np.array([2, 2]) - b_r = np.atleast_2d([2, 2]) - b_c = b_r.T - lst = [2, 2, 2] - tup = tuple(lst) - - # gt - expected = DataFrame([[False, False], [False, True], [True, True]]) - result = df > b - tm.assert_frame_equal(result, expected) - - result = df.values > b - tm.assert_numpy_array_equal(result, expected.values) - - msg1d = "Unable to coerce to Series, length must be 2: given 3" - msg2d = "Unable to coerce to DataFrame, shape must be" - msg2db = "operands could not be broadcast together with shapes" - with pytest.raises(ValueError, match=msg1d): - # wrong shape - df > lst - - with pytest.raises(ValueError, match=msg1d): - # wrong shape - result = df > tup - - # broadcasts like ndarray (GH#23000) - result = df > b_r - tm.assert_frame_equal(result, expected) - - result = df.values > b_r - tm.assert_numpy_array_equal(result, expected.values) - - with pytest.raises(ValueError, match=msg2d): - df > b_c - - with pytest.raises(ValueError, match=msg2db): - df.values > b_c - - # == - expected = DataFrame([[False, False], [True, False], [False, False]]) - result = df == b - tm.assert_frame_equal(result, expected) - - with pytest.raises(ValueError, match=msg1d): - result = df == lst - - with pytest.raises(ValueError, match=msg1d): - result = df == tup - - # broadcasts like ndarray (GH#23000) - result = df == b_r - tm.assert_frame_equal(result, expected) - - result = df.values == b_r - tm.assert_numpy_array_equal(result, expected.values) - - with pytest.raises(ValueError, match=msg2d): - df 
== b_c - - assert df.values.shape != b_c.shape - - # with alignment - df = DataFrame( - np.arange(6).reshape((3, 2)), columns=list("AB"), index=list("abc") - ) - expected.index = df.index - expected.columns = df.columns - - with pytest.raises(ValueError, match=msg1d): - result = df == lst - - with pytest.raises(ValueError, match=msg1d): - result = df == tup - - def test_inplace_ops_alignment(self): - - # inplace ops / ops alignment - # GH 8511 - - columns = list("abcdefg") - X_orig = DataFrame( - np.arange(10 * len(columns)).reshape(-1, len(columns)), - columns=columns, - index=range(10), - ) - Z = 100 * X_orig.iloc[:, 1:-1].copy() - block1 = list("bedcf") - subs = list("bcdef") - - # add - X = X_orig.copy() - result1 = (X[block1] + Z).reindex(columns=subs) - - X[block1] += Z - result2 = X.reindex(columns=subs) - - X = X_orig.copy() - result3 = (X[block1] + Z[block1]).reindex(columns=subs) - - X[block1] += Z[block1] - result4 = X.reindex(columns=subs) - - tm.assert_frame_equal(result1, result2) - tm.assert_frame_equal(result1, result3) - tm.assert_frame_equal(result1, result4) - - # sub - X = X_orig.copy() - result1 = (X[block1] - Z).reindex(columns=subs) - - X[block1] -= Z - result2 = X.reindex(columns=subs) - - X = X_orig.copy() - result3 = (X[block1] - Z[block1]).reindex(columns=subs) - - X[block1] -= Z[block1] - result4 = X.reindex(columns=subs) - - tm.assert_frame_equal(result1, result2) - tm.assert_frame_equal(result1, result3) - tm.assert_frame_equal(result1, result4) - - def test_inplace_ops_identity(self): - - # GH 5104 - # make sure that we are actually changing the object - s_orig = Series([1, 2, 3]) - df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5)) - - # no dtype change - s = s_orig.copy() - s2 = s - s += 1 - tm.assert_series_equal(s, s2) - tm.assert_series_equal(s_orig + 1, s) - assert s is s2 - assert s._data is s2._data - - df = df_orig.copy() - df2 = df - df += 1 - tm.assert_frame_equal(df, df2) - tm.assert_frame_equal(df_orig + 1, df) - assert df is df2 - assert df._data is df2._data - - # dtype change - s = s_orig.copy() - s2 = s - s += 1.5 - tm.assert_series_equal(s, s2) - tm.assert_series_equal(s_orig + 1.5, s) - - df = df_orig.copy() - df2 = df - df += 1.5 - tm.assert_frame_equal(df, df2) - tm.assert_frame_equal(df_orig + 1.5, df) - assert df is df2 - assert df._data is df2._data - - # mixed dtype - arr = np.random.randint(0, 10, size=5) - df_orig = DataFrame({"A": arr.copy(), "B": "foo"}) - df = df_orig.copy() - df2 = df - df["A"] += 1 - expected = DataFrame({"A": arr.copy() + 1, "B": "foo"}) - tm.assert_frame_equal(df, expected) - tm.assert_frame_equal(df2, expected) - assert df._data is df2._data - - df = df_orig.copy() - df2 = df - df["A"] += 1.5 - expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"}) - tm.assert_frame_equal(df, expected) - tm.assert_frame_equal(df2, expected) - assert df._data is df2._data - - @pytest.mark.parametrize( - "op", - [ - "add", - "and", - "div", - "floordiv", - "mod", - "mul", - "or", - "pow", - "sub", - "truediv", - "xor", - ], - ) - def test_inplace_ops_identity2(self, op): - - if op == "div": - return - - df = DataFrame({"a": [1.0, 2.0, 3.0], "b": [1, 2, 3]}) - - operand = 2 - if op in ("and", "or", "xor"): - # cannot use floats for boolean ops - df["a"] = [True, False, True] - - df_copy = df.copy() - iop = f"__i{op}__" - op = f"__{op}__" - - # no id change and value is correct - getattr(df, iop)(operand) - expected = getattr(df_copy, op)(operand) - tm.assert_frame_equal(df, expected) - expected = id(df) - 
assert id(df) == expected - - def test_alignment_non_pandas(self): - index = ["A", "B", "C"] - columns = ["X", "Y", "Z"] - df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns) - - align = pd.core.ops._align_method_FRAME - for val in [ - [1, 2, 3], - (1, 2, 3), - np.array([1, 2, 3], dtype=np.int64), - range(1, 4), - ]: - - tm.assert_series_equal( - align(df, val, "index")[1], Series([1, 2, 3], index=df.index) - ) - tm.assert_series_equal( - align(df, val, "columns")[1], Series([1, 2, 3], index=df.columns) - ) - - # length mismatch - msg = "Unable to coerce to Series, length must be 3: given 2" - for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]: - - with pytest.raises(ValueError, match=msg): - align(df, val, "index") - - with pytest.raises(ValueError, match=msg): - align(df, val, "columns") - - val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - tm.assert_frame_equal( - align(df, val, "index")[1], - DataFrame(val, index=df.index, columns=df.columns), - ) - tm.assert_frame_equal( - align(df, val, "columns")[1], - DataFrame(val, index=df.index, columns=df.columns), - ) - - # shape mismatch - msg = "Unable to coerce to DataFrame, shape must be" - val = np.array([[1, 2, 3], [4, 5, 6]]) - with pytest.raises(ValueError, match=msg): - align(df, val, "index") - - with pytest.raises(ValueError, match=msg): - align(df, val, "columns") - - val = np.zeros((3, 3, 3)) - with pytest.raises(ValueError): - align(df, val, "index") - with pytest.raises(ValueError): - align(df, val, "columns") - - def test_no_warning(self, all_arithmetic_operators): - df = pd.DataFrame({"A": [0.0, 0.0], "B": [0.0, None]}) - b = df["B"] - with tm.assert_produces_warning(None): - getattr(df, all_arithmetic_operators)(b, 0) diff --git a/pandas/tests/frame/test_period.py b/pandas/tests/frame/test_period.py index 1ce13fd31ba88..c378194b9e2b2 100644 --- a/pandas/tests/frame/test_period.py +++ b/pandas/tests/frame/test_period.py @@ -4,10 +4,6 @@ import pandas._testing as tm -def _permute(obj): - return obj.take(np.random.permutation(len(obj))) - - class TestPeriodIndex: def test_as_frame_columns(self): rng = period_range("1/1/2000", periods=5) @@ -42,15 +38,3 @@ def test_frame_index_to_string(self): # it works! 
         frame.to_string()
-
-    def test_align_frame(self):
-        rng = period_range("1/1/2000", "1/1/2010", freq="A")
-        ts = DataFrame(np.random.randn(len(rng), 3), index=rng)
-
-        result = ts + ts[::2]
-        expected = ts + ts
-        expected.values[1::2] = np.nan
-        tm.assert_frame_equal(result, expected)
-
-        result = ts + _permute(ts[::2])
-        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
index bf9eeb532b43b..1a07780462ea3 100644
--- a/pandas/tests/frame/test_query_eval.py
+++ b/pandas/tests/frame/test_query_eval.py
@@ -1165,6 +1165,11 @@ def test_lots_of_operators_string(self, df):
         expect = df[df[" &^ :!€$?(} > <++*'' "] > 4]
         tm.assert_frame_equal(res, expect)
 
+    def test_missing_attribute(self, df):
+        message = "module 'pandas' has no attribute 'thing'"
+        with pytest.raises(AttributeError, match=message):
+            df.eval("@pd.thing")
+
     def test_failing_quote(self, df):
         with pytest.raises(SyntaxError):
             df.query("`it's` > `that's`")
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
index c5d4d59adbc35..6d786d9580542 100644
--- a/pandas/tests/frame/test_repr_info.py
+++ b/pandas/tests/frame/test_repr_info.py
@@ -17,9 +17,6 @@
 import pandas.io.formats.format as fmt
 
-# Segregated collection of methods that require the BlockManager internal data
-# structure
-
 
 class TestDataFrameReprInfoEtc:
     def test_repr_empty(self):
@@ -137,6 +134,10 @@ def test_unicode_string_with_unicode(self):
         df = DataFrame({"A": ["\u05d0"]})
         str(df)
 
+    def test_repr_unicode_columns(self):
+        df = DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]})
+        repr(df.columns)  # should not raise UnicodeDecodeError
+
     def test_str_to_bytes_raises(self):
         # GH 26447
         df = DataFrame({"A": ["abc"]})
@@ -212,3 +213,8 @@ def test_repr_np_nat_with_object(self, arg, box, expected):
         # GH 25445
         result = repr(box([arg("NaT")], dtype=object))
         assert result == expected
+
+    def test_frame_datetime64_pre1900_repr(self):
+        df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")})
+        # it works!
+        repr(df)
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
index 46a4a0a2af4ba..9d3c40ce926d7 100644
--- a/pandas/tests/frame/test_reshape.py
+++ b/pandas/tests/frame/test_reshape.py
@@ -695,10 +695,11 @@ def test_unstack_dtypes(self):
     def test_unstack_non_unique_index_names(self):
         idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"])
         df = DataFrame([1, 2], index=idx)
-        with pytest.raises(ValueError):
+        msg = "The name c1 occurs multiple times, use a level number"
+        with pytest.raises(ValueError, match=msg):
             df.unstack("c1")
 
-        with pytest.raises(ValueError):
+        with pytest.raises(ValueError, match=msg):
             df.T.stack("c1")
 
     def test_unstack_unused_levels(self):
@@ -764,6 +765,60 @@ def test_unstack_unused_level(self, cols):
         expected.index = expected.index.droplevel("C")
         tm.assert_frame_equal(result, expected)
 
+    def test_unstack_long_index(self):
+        # GH 32624: Error when using a lot of indices to unstack.
+        # The error occurred only if a lot of indices are used.
+        df = pd.DataFrame(
+            [[1]],
+            columns=pd.MultiIndex.from_tuples([[0]], names=["c1"]),
+            index=pd.MultiIndex.from_tuples(
+                [[0, 0, 1, 0, 0, 0, 1]],
+                names=["i1", "i2", "i3", "i4", "i5", "i6", "i7"],
+            ),
+        )
+        result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"])
+        expected = pd.DataFrame(
+            [[1]],
+            columns=pd.MultiIndex.from_tuples(
+                [[0, 0, 1, 0, 0, 0, 1]],
+                names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"],
+            ),
+            index=pd.Index([0], name="i1"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_multi_level_cols(self):
+        # GH 24729: Unstack a df with multi level columns
+        df = pd.DataFrame(
+            [[0.0, 0.0], [0.0, 0.0]],
+            columns=pd.MultiIndex.from_tuples(
+                [["B", "C"], ["B", "D"]], names=["c1", "c2"]
+            ),
+            index=pd.MultiIndex.from_tuples(
+                [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"],
+            ),
+        )
+        assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"]
+
+    def test_unstack_multi_level_rows_and_cols(self):
+        # GH 28306: Unstack df with multi level cols and rows
+        df = pd.DataFrame(
+            [[1, 2], [3, 4], [-1, -2], [-3, -4]],
+            columns=pd.MultiIndex.from_tuples([["a", "b", "c"], ["d", "e", "f"]]),
+            index=pd.MultiIndex.from_tuples(
+                [
+                    ["m1", "P3", 222],
+                    ["m1", "A5", 111],
+                    ["m2", "P3", 222],
+                    ["m2", "A5", 111],
+                ],
+                names=["i1", "i2", "i3"],
+            ),
+        )
+        result = df.unstack(["i3", "i2"])
+        expected = df.unstack(["i3"]).unstack(["i2"])
+        tm.assert_frame_equal(result, expected)
+
     def test_unstack_nan_index(self):  # GH7466
         def cast(val):
             val_str = "" if val != val else val
diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py
index a2e7dc527c4b8..16bf651829a04 100644
--- a/pandas/tests/frame/test_subclass.py
+++ b/pandas/tests/frame/test_subclass.py
@@ -163,12 +163,14 @@ def test_subclass_align_combinations(self):
 
         # frame + series
         res1, res2 = df.align(s, axis=0)
-        exp1 = pd.DataFrame(
+        exp1 = tm.SubclassedDataFrame(
             {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
             index=list("ABCDE"),
         )
         # name is lost when
-        exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x")
+        exp2 = tm.SubclassedSeries(
+            [1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x"
+        )
 
         assert isinstance(res1, tm.SubclassedDataFrame)
         tm.assert_frame_equal(res1, exp1)
diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py
index b713af92eac27..63361789b8e50 100644
--- a/pandas/tests/frame/test_timeseries.py
+++ b/pandas/tests/frame/test_timeseries.py
@@ -1,8 +1,7 @@
 import numpy as np
-import pytest
 
 import pandas as pd
-from pandas import DataFrame, Series, date_range, to_datetime
+from pandas import DataFrame, date_range, to_datetime
 import pandas._testing as tm
 
 
@@ -14,18 +13,6 @@ def test_frame_ctor_datetime64_column(self):
         df = DataFrame({"A": np.random.randn(len(rng)), "B": dates})
         assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]"))
 
-    def test_frame_append_datetime64_column(self):
-        rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
-        df = DataFrame(index=np.arange(len(rng)))
-
-        df["A"] = rng
-        assert np.issubdtype(df["A"].dtype, np.dtype("M8[ns]"))
-
-    def test_frame_datetime64_pre1900_repr(self):
-        df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")})
-        # it works!
- repr(df) - def test_frame_append_datetime64_col_other_units(self): n = 100 @@ -60,140 +47,6 @@ def test_frame_append_datetime64_col_other_units(self): assert (tmp["dates"].values == ex_vals).all() - @pytest.mark.parametrize( - "data,idx,expected_first,expected_last", - [ - ({"A": [1, 2, 3]}, [1, 1, 2], 1, 2), - ({"A": [1, 2, 3]}, [1, 2, 2], 1, 2), - ({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"), - ({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2), - ({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), - ({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2), - ], - ) - def test_first_last_valid( - self, float_frame, data, idx, expected_first, expected_last - ): - N = len(float_frame.index) - mat = np.random.randn(N) - mat[:5] = np.nan - mat[-5:] = np.nan - - frame = DataFrame({"foo": mat}, index=float_frame.index) - index = frame.first_valid_index() - - assert index == frame.index[5] - - index = frame.last_valid_index() - assert index == frame.index[-6] - - # GH12800 - empty = DataFrame() - assert empty.last_valid_index() is None - assert empty.first_valid_index() is None - - # GH17400: no valid entries - frame[:] = np.nan - assert frame.last_valid_index() is None - assert frame.first_valid_index() is None - - # GH20499: its preserves freq with holes - frame.index = date_range("20110101", periods=N, freq="B") - frame.iloc[1] = 1 - frame.iloc[-2] = 1 - assert frame.first_valid_index() == frame.index[1] - assert frame.last_valid_index() == frame.index[-2] - assert frame.first_valid_index().freq == frame.index.freq - assert frame.last_valid_index().freq == frame.index.freq - - # GH 21441 - df = DataFrame(data, index=idx) - assert expected_first == df.first_valid_index() - assert expected_last == df.last_valid_index() - - @pytest.mark.parametrize("klass", [Series, DataFrame]) - def test_first_valid_index_all_nan(self, klass): - # GH#9752 Series/DataFrame should both return None, not raise - obj = klass([np.nan]) - - assert obj.first_valid_index() is None - assert obj.iloc[:0].first_valid_index() is None - - def test_first_subset(self): - ts = tm.makeTimeDataFrame(freq="12h") - result = ts.first("10d") - assert len(result) == 20 - - ts = tm.makeTimeDataFrame(freq="D") - result = ts.first("10d") - assert len(result) == 10 - - result = ts.first("3M") - expected = ts[:"3/31/2000"] - tm.assert_frame_equal(result, expected) - - result = ts.first("21D") - expected = ts[:21] - tm.assert_frame_equal(result, expected) - - result = ts[:0].first("3M") - tm.assert_frame_equal(result, ts[:0]) - - def test_first_raises(self): - # GH20725 - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) - with pytest.raises(TypeError): # index is not a DatetimeIndex - df.first("1D") - - def test_last_subset(self): - ts = tm.makeTimeDataFrame(freq="12h") - result = ts.last("10d") - assert len(result) == 20 - - ts = tm.makeTimeDataFrame(nper=30, freq="D") - result = ts.last("10d") - assert len(result) == 10 - - result = ts.last("21D") - expected = ts["2000-01-10":] - tm.assert_frame_equal(result, expected) - - result = ts.last("21D") - expected = ts[-21:] - tm.assert_frame_equal(result, expected) - - result = ts[:0].last("3M") - tm.assert_frame_equal(result, ts[:0]) - - def test_last_raises(self): - # GH20725 - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) - with pytest.raises(TypeError): # index is not a DatetimeIndex - df.last("1D") - - def test_operation_on_NaT(self): - # Both NaT and Timestamp are in DataFrame. 
- df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]}) - - res = df.min() - exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) - tm.assert_series_equal(res, exp) - - res = df.max() - exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) - tm.assert_series_equal(res, exp) - - # GH12941, only NaTs are in DataFrame. - df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]}) - - res = df.min() - exp = pd.Series([pd.NaT], index=["foo"]) - tm.assert_series_equal(res, exp) - - res = df.max() - exp = pd.Series([pd.NaT], index=["foo"]) - tm.assert_series_equal(res, exp) - def test_datetime_assignment_with_NaT_and_diff_time_units(self): # GH 7492 data_ns = np.array([1, "nat"], dtype="datetime64[ns]") diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index 62e8a4b470218..dfd4fb1855383 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -1,8 +1,6 @@ """ Tests for DataFrame timezone-related methods """ -from datetime import datetime - import numpy as np import pytest import pytz @@ -53,12 +51,6 @@ def test_frame_values_with_tz(self): result = df.values tm.assert_numpy_array_equal(result, expected) - def test_frame_from_records_utc(self): - rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)} - - # it works - DataFrame.from_records([rec], index="begin_time") - def test_frame_join_tzaware(self): test1 = DataFrame( np.zeros((6, 3)), @@ -80,17 +72,6 @@ def test_frame_join_tzaware(self): tm.assert_index_equal(result.index, ex_index) assert result.index.tz.zone == "US/Central" - def test_frame_add_tz_mismatch_converts_to_utc(self): - rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") - df = DataFrame(np.random.randn(len(rng)), index=rng, columns=["a"]) - - df_moscow = df.tz_convert("Europe/Moscow") - result = df + df_moscow - assert result.index.tz is pytz.utc - - result = df_moscow + df - assert result.index.tz is pytz.utc - def test_frame_align_aware(self): idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern") diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 86c9a98377f3f..a9d9d0ace8701 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -250,9 +250,7 @@ def make_dtnat_arr(n, nnat=None): df.to_csv(pth, chunksize=chunksize) recons = self.read_csv(pth)._convert(datetime=True, coerce=True) - tm.assert_frame_equal( - df, recons, check_names=False, check_less_precise=True - ) + tm.assert_frame_equal(df, recons, check_names=False) @pytest.mark.slow def test_to_csv_moar(self): @@ -354,9 +352,7 @@ def _to_uni(x): recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) - tm.assert_frame_equal( - df, recons, check_names=False, check_less_precise=True - ) + tm.assert_frame_equal(df, recons, check_names=False) N = 100 chunksize = 1000 @@ -761,7 +757,7 @@ def create_cols(name): ) # add in some nans - df_float.loc[30:50, 1:3] = np.nan + df_float.iloc[30:50, 1:3] = np.nan # ## this is a bug in read_csv right now #### # df_dt.loc[30:50,1:3] = np.nan @@ -1356,3 +1352,12 @@ def test_gz_lineend(self): result = f.read().decode("utf-8") assert result == expected + + def test_to_csv_numpy_16_bug(self): + frame = DataFrame({"a": date_range("1/1/2000", periods=10)}) + + buf = StringIO() + frame.to_csv(buf) + + result = buf.getvalue() + assert "2000-01-01" in result diff --git 
a/pandas/tests/generic/methods/__init__.py b/pandas/tests/generic/methods/__init__.py new file mode 100644 index 0000000000000..5d18f97b8a55e --- /dev/null +++ b/pandas/tests/generic/methods/__init__.py @@ -0,0 +1,3 @@ +""" +Tests for methods shared by DataFrame and Series. +""" diff --git a/pandas/tests/generic/methods/test_dot.py b/pandas/tests/generic/methods/test_dot.py new file mode 100644 index 0000000000000..ecbec6b06e923 --- /dev/null +++ b/pandas/tests/generic/methods/test_dot.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class DotSharedTests: + @pytest.fixture + def obj(self): + raise NotImplementedError + + @pytest.fixture + def other(self) -> DataFrame: + """ + other is a DataFrame that is indexed so that obj.dot(other) is valid + """ + raise NotImplementedError + + @pytest.fixture + def expected(self, obj, other) -> DataFrame: + """ + The expected result of obj.dot(other) + """ + raise NotImplementedError + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + raise NotImplementedError + + def test_dot_equiv_values_dot(self, obj, other, expected): + # `expected` is constructed from obj.values.dot(other.values) + result = obj.dot(other) + tm.assert_equal(result, expected) + + def test_dot_2d_ndarray(self, obj, other, expected): + # Check ndarray argument; in this case we get matching values, + # but index/columns may not match + result = obj.dot(other.values) + assert np.all(result == expected.values) + + def test_dot_1d_ndarray(self, obj, expected): + # can pass correct-length array + row = obj.iloc[0] if obj.ndim == 2 else obj + + result = obj.dot(row.values) + expected = obj.dot(row) + self.reduced_dim_assert(result, expected) + + def test_dot_series(self, obj, other, expected): + # Check series argument + result = obj.dot(other["1"]) + self.reduced_dim_assert(result, expected["1"]) + + def test_dot_series_alignment(self, obj, other, expected): + result = obj.dot(other.iloc[::-1]["1"]) + self.reduced_dim_assert(result, expected["1"]) + + def test_dot_aligns(self, obj, other, expected): + # Check index alignment + other2 = other.iloc[::-1] + result = obj.dot(other2) + tm.assert_equal(result, expected) + + def test_dot_shape_mismatch(self, obj): + msg = "Dot product shape mismatch" + # exception raised is of type Exception + with pytest.raises(Exception, match=msg): + obj.dot(obj.values[:3]) + + def test_dot_misaligned(self, obj, other): + msg = "matrices are not aligned" + with pytest.raises(ValueError, match=msg): + obj.dot(other.T) + + +class TestSeriesDot(DotSharedTests): + @pytest.fixture + def obj(self): + return Series(np.random.randn(4), index=["p", "q", "r", "s"]) + + @pytest.fixture + def other(self): + return DataFrame( + np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] + ).T + + @pytest.fixture + def expected(self, obj, other): + return Series(np.dot(obj.values, other.values), index=other.columns) + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + tm.assert_almost_equal(result, expected) + + +class TestDataFrameDot(DotSharedTests): + @pytest.fixture + def obj(self): + return DataFrame( + np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] + ) + + @pytest.fixture + def other(self): + return DataFrame( + np.random.randn(4, 2), index=["p", "q", "r", "s"], 
columns=["1", "2"] + ) + + @pytest.fixture + def expected(self, obj, other): + return DataFrame( + np.dot(obj.values, other.values), index=obj.index, columns=other.columns + ) + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + tm.assert_series_equal(result, expected, check_names=False) + assert result.name is None diff --git a/pandas/tests/generic/methods/test_first_valid_index.py b/pandas/tests/generic/methods/test_first_valid_index.py new file mode 100644 index 0000000000000..bca3452c3c458 --- /dev/null +++ b/pandas/tests/generic/methods/test_first_valid_index.py @@ -0,0 +1,90 @@ +""" +Includes test for last_valid_index. +""" +import numpy as np +import pytest + +from pandas import DataFrame, Series, date_range +import pandas._testing as tm + + +class TestFirstValidIndex: + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_first_valid_index_single_nan(self, klass): + # GH#9752 Series/DataFrame should both return None, not raise + obj = klass([np.nan]) + + assert obj.first_valid_index() is None + assert obj.iloc[:0].first_valid_index() is None + + @pytest.mark.parametrize( + "empty", [DataFrame(), Series(dtype=object), Series([], index=[], dtype=object)] + ) + def test_first_valid_index_empty(self, empty): + # GH#12800 + assert empty.last_valid_index() is None + assert empty.first_valid_index() is None + + @pytest.mark.parametrize( + "data,idx,expected_first,expected_last", + [ + ({"A": [1, 2, 3]}, [1, 1, 2], 1, 2), + ({"A": [1, 2, 3]}, [1, 2, 2], 1, 2), + ({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"), + ({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2), + ], + ) + def test_first_last_valid_frame(self, data, idx, expected_first, expected_last): + # GH#21441 + df = DataFrame(data, index=idx) + assert expected_first == df.first_valid_index() + assert expected_last == df.last_valid_index() + + @pytest.mark.parametrize("index_func", [tm.makeStringIndex, tm.makeDateIndex]) + def test_first_last_valid(self, index_func): + N = 30 + index = index_func(N) + mat = np.random.randn(N) + mat[:5] = np.nan + mat[-5:] = np.nan + + frame = DataFrame({"foo": mat}, index=index) + assert frame.first_valid_index() == frame.index[5] + assert frame.last_valid_index() == frame.index[-6] + + ser = frame["foo"] + assert ser.first_valid_index() == frame.index[5] + assert ser.last_valid_index() == frame.index[-6] + + @pytest.mark.parametrize("index_func", [tm.makeStringIndex, tm.makeDateIndex]) + def test_first_last_valid_all_nan(self, index_func): + # GH#17400: no valid entries + index = index_func(30) + frame = DataFrame(np.nan, columns=["foo"], index=index) + + assert frame.last_valid_index() is None + assert frame.first_valid_index() is None + + ser = frame["foo"] + assert ser.first_valid_index() is None + assert ser.last_valid_index() is None + + def test_first_last_valid_preserves_freq(self): + # GH#20499: its preserves freq with holes + index = date_range("20110101", periods=30, freq="B") + frame = DataFrame(np.nan, columns=["foo"], index=index) + + frame.iloc[1] = 1 + frame.iloc[-2] = 1 + assert frame.first_valid_index() == frame.index[1] + assert frame.last_valid_index() == frame.index[-2] + assert frame.first_valid_index().freq == frame.index.freq + assert frame.last_valid_index().freq == frame.index.freq + + ts = frame["foo"] + assert ts.first_valid_index() == ts.index[1] + assert ts.last_valid_index() == 
ts.index[-2] + assert ts.first_valid_index().freq == ts.index.freq + assert ts.last_valid_index().freq == ts.index.freq diff --git a/pandas/tests/generic/methods/test_reorder_levels.py b/pandas/tests/generic/methods/test_reorder_levels.py new file mode 100644 index 0000000000000..8bb6417e56659 --- /dev/null +++ b/pandas/tests/generic/methods/test_reorder_levels.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm + + +class TestReorderLevels: + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_reorder_levels(self, klass): + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + names=["L0", "L1", "L2"], + ) + df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index) + obj = df if klass is DataFrame else df["A"] + + # no change, position + result = obj.reorder_levels([0, 1, 2]) + tm.assert_equal(obj, result) + + # no change, labels + result = obj.reorder_levels(["L0", "L1", "L2"]) + tm.assert_equal(obj, result) + + # rotate, position + result = obj.reorder_levels([1, 2, 0]) + e_idx = MultiIndex( + levels=[["one", "two", "three"], [0, 1], ["bar"]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], + names=["L1", "L2", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) + expected = expected if klass is DataFrame else expected["A"] + tm.assert_equal(result, expected) + + result = obj.reorder_levels([0, 0, 0]) + e_idx = MultiIndex( + levels=[["bar"], ["bar"], ["bar"]], + codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], + names=["L0", "L0", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) + expected = expected if klass is DataFrame else expected["A"] + tm.assert_equal(result, expected) + + result = obj.reorder_levels(["L0", "L0", "L0"]) + tm.assert_equal(result, expected) + + def test_reorder_levels_swaplevel_equivalence( + self, multiindex_year_month_day_dataframe_random_data + ): + + ymd = multiindex_year_month_day_dataframe_random_data + + result = ymd.reorder_levels(["month", "day", "year"]) + expected = ymd.swaplevel(0, 1).swaplevel(1, 2) + tm.assert_frame_equal(result, expected) + + result = ymd["A"].reorder_levels(["month", "day", "year"]) + expected = ymd["A"].swaplevel(0, 1).swaplevel(1, 2) + tm.assert_series_equal(result, expected) + + result = ymd.T.reorder_levels(["month", "day", "year"], axis=1) + expected = ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1) + tm.assert_frame_equal(result, expected) + + with pytest.raises(TypeError, match="hierarchical axis"): + ymd.reorder_levels([1, 2], axis=1) + + with pytest.raises(IndexError, match="Too many levels"): + ymd.index.reorder_levels([1, 2, 3]) diff --git a/pandas/tests/generic/methods/test_set_axis.py b/pandas/tests/generic/methods/test_set_axis.py new file mode 100644 index 0000000000000..278d43ef93d2f --- /dev/null +++ b/pandas/tests/generic/methods/test_set_axis.py @@ -0,0 +1,75 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class SharedSetAxisTests: + @pytest.fixture + def obj(self): + raise NotImplementedError("Implemented by subclasses") + + def test_set_axis(self, obj): + # GH14636; this tests setting index for both Series and DataFrame + new_index = list("abcd")[: len(obj)] + + expected = obj.copy() + expected.index = new_index + + # inplace=False + 
result = obj.set_axis(new_index, axis=0, inplace=False) + tm.assert_equal(expected, result) + + @pytest.mark.parametrize("axis", [0, "index", 1, "columns"]) + def test_set_axis_inplace_axis(self, axis, obj): + # GH#14636 + if obj.ndim == 1 and axis in [1, "columns"]: + # Series only has [0, "index"] + return + + new_index = list("abcd")[: len(obj)] + + expected = obj.copy() + if axis in [0, "index"]: + expected.index = new_index + else: + expected.columns = new_index + + result = obj.copy() + result.set_axis(new_index, axis=axis, inplace=True) + tm.assert_equal(result, expected) + + def test_set_axis_unnamed_kwarg_warns(self, obj): + # omitting the "axis" parameter + new_index = list("abcd")[: len(obj)] + + expected = obj.copy() + expected.index = new_index + + with tm.assert_produces_warning(None): + result = obj.set_axis(new_index, inplace=False) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("axis", [3, "foo"]) + def test_set_axis_invalid_axis_name(self, axis, obj): + # wrong values for the "axis" parameter + with pytest.raises(ValueError, match="No axis named"): + obj.set_axis(list("abc"), axis=axis) + + +class TestDataFrameSetAxis(SharedSetAxisTests): + @pytest.fixture + def obj(self): + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, + index=[2010, 2011, 2012], + ) + return df + + +class TestSeriesSetAxis(SharedSetAxisTests): + @pytest.fixture + def obj(self): + ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") + return ser diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py new file mode 100644 index 0000000000000..d307eef8beb62 --- /dev/null +++ b/pandas/tests/generic/test_finalize.py @@ -0,0 +1,782 @@ +""" +An exhaustive list of pandas methods exercising NDFrame.__finalize__. +""" +import operator +import re + +import numpy as np +import pytest + +import pandas as pd + +# TODO: +# * Binary methods (mul, div, etc.) +# * Binary outputs (align, etc.) +# * top-level methods (concat, merge, get_dummies, etc.) +# * window +# * cumulative reductions + +not_implemented_mark = pytest.mark.xfail(reason="not implemented") + +mi = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=["A", "B"]) + +frame_data = ({"A": [1]},) +frame_mi_data = ({"A": [1, 2, 3, 4]}, mi) + + +# Tuple of +# - Callable: Constructor (Series, DataFrame) +# - Tuple: Constructor args +# - Callable: pass the constructed value with attrs set to this. 
+ +_all_methods = [ + ( + pd.Series, + (np.array([0], dtype="float64")), + operator.methodcaller("view", "int64"), + ), + (pd.Series, ([0],), operator.methodcaller("take", [])), + (pd.Series, ([0],), operator.methodcaller("__getitem__", [True])), + (pd.Series, ([0],), operator.methodcaller("repeat", 2)), + pytest.param( + (pd.Series, ([0],), operator.methodcaller("reset_index")), + marks=pytest.mark.xfail, + ), + (pd.Series, ([0],), operator.methodcaller("reset_index", drop=True)), + pytest.param( + (pd.Series, ([0],), operator.methodcaller("to_frame")), marks=pytest.mark.xfail + ), + (pd.Series, (0, mi), operator.methodcaller("count", level="A")), + (pd.Series, ([0, 0],), operator.methodcaller("drop_duplicates")), + (pd.Series, ([0, 0],), operator.methodcaller("duplicated")), + (pd.Series, ([0, 0],), operator.methodcaller("round")), + (pd.Series, ([0, 0],), operator.methodcaller("rename", lambda x: x + 1)), + (pd.Series, ([0, 0],), operator.methodcaller("rename", "name")), + (pd.Series, ([0, 0],), operator.methodcaller("set_axis", ["a", "b"])), + (pd.Series, ([0, 0],), operator.methodcaller("reindex", [1, 0])), + (pd.Series, ([0, 0],), operator.methodcaller("drop", [0])), + (pd.Series, (pd.array([0, pd.NA]),), operator.methodcaller("fillna", 0)), + (pd.Series, ([0, 0],), operator.methodcaller("replace", {0: 1})), + (pd.Series, ([0, 0],), operator.methodcaller("shift")), + (pd.Series, ([0, 0],), operator.methodcaller("isin", [0, 1])), + (pd.Series, ([0, 0],), operator.methodcaller("between", 0, 2)), + (pd.Series, ([0, 0],), operator.methodcaller("isna")), + (pd.Series, ([0, 0],), operator.methodcaller("isnull")), + (pd.Series, ([0, 0],), operator.methodcaller("notna")), + (pd.Series, ([0, 0],), operator.methodcaller("notnull")), + (pd.Series, ([1],), operator.methodcaller("add", pd.Series([1]))), + # TODO: mul, div, etc. 
+ ( + pd.Series, + ([0], pd.period_range("2000", periods=1)), + operator.methodcaller("to_timestamp"), + ), + ( + pd.Series, + ([0], pd.date_range("2000", periods=1)), + operator.methodcaller("to_period"), + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("dot", pd.DataFrame(index=["A"])), + ), + marks=pytest.mark.xfail(reason="Implement binary finalize"), + ), + (pd.DataFrame, frame_data, operator.methodcaller("transpose")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", "A")), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", ["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", np.array([True]))), + (pd.DataFrame, ({("A", "a"): [1]},), operator.methodcaller("__getitem__", ["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("query", "A == 1")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("eval", "A + 1")), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("select_dtypes", include="int")), + (pd.DataFrame, frame_data, operator.methodcaller("assign", b=1)), + (pd.DataFrame, frame_data, operator.methodcaller("set_axis", ["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("reindex", [0, 1])), + (pd.DataFrame, frame_data, operator.methodcaller("drop", columns=["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("drop", index=[0])), + (pd.DataFrame, frame_data, operator.methodcaller("rename", columns={"A": "a"})), + (pd.DataFrame, frame_data, operator.methodcaller("rename", index=lambda x: x)), + (pd.DataFrame, frame_data, operator.methodcaller("fillna", "A")), + (pd.DataFrame, frame_data, operator.methodcaller("fillna", method="ffill")), + (pd.DataFrame, frame_data, operator.methodcaller("set_index", "A")), + (pd.DataFrame, frame_data, operator.methodcaller("reset_index")), + (pd.DataFrame, frame_data, operator.methodcaller("isna")), + (pd.DataFrame, frame_data, operator.methodcaller("isnull")), + (pd.DataFrame, frame_data, operator.methodcaller("notna")), + (pd.DataFrame, frame_data, operator.methodcaller("notnull")), + (pd.DataFrame, frame_data, operator.methodcaller("dropna")), + (pd.DataFrame, frame_data, operator.methodcaller("drop_duplicates")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("duplicated")), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("sort_values", by="A")), + (pd.DataFrame, frame_data, operator.methodcaller("sort_index")), + (pd.DataFrame, frame_data, operator.methodcaller("nlargest", 1, "A")), + (pd.DataFrame, frame_data, operator.methodcaller("nsmallest", 1, "A")), + (pd.DataFrame, frame_mi_data, operator.methodcaller("swaplevel"),), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("add", pd.DataFrame(*frame_data)), + ), + marks=not_implemented_mark, + ), + # TODO: div, mul, etc. 
+ pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("combine", pd.DataFrame(*frame_data), operator.add), + ), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("combine_first", pd.DataFrame(*frame_data)), + ), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("update", pd.DataFrame(*frame_data)), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("pivot", columns="A")), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + {"A": [1], "B": [1]}, + operator.methodcaller("pivot_table", columns="A"), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("stack")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("explode", "A")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack"),), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + ({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]},), + operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("diff")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("applymap", lambda x: x)), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("append", pd.DataFrame({"A": [1]})), + ), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("append", pd.DataFrame({"B": [1]})), + ), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("merge", pd.DataFrame({"A": [1]})), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("round", 2)), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("corr")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("cov")), + marks=[ + not_implemented_mark, + pytest.mark.filterwarnings("ignore::RuntimeWarning"), + ], + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("corrwith", pd.DataFrame(*frame_data)), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("count")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("count", level="A")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("nunique")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("idxmin")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("idxmax")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("mode")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("quantile")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("quantile", q=[0.25, 0.75])), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, 
operator.methodcaller("quantile")), + marks=not_implemented_mark, + ), + ( + pd.DataFrame, + ({"A": [1]}, [pd.Period("2000", "D")]), + operator.methodcaller("to_timestamp"), + ), + ( + pd.DataFrame, + ({"A": [1]}, [pd.Timestamp("2000")]), + operator.methodcaller("to_period", freq="D"), + ), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("isin", [1])), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("isin", pd.Series([1]))), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_mi_data, + operator.methodcaller("isin", pd.DataFrame({"A": [1]})), + ), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("swapaxes", 0, 1)), + (pd.DataFrame, frame_mi_data, operator.methodcaller("droplevel", "A")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("pop", "A")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("squeeze")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.Series, ([1, 2],), operator.methodcaller("squeeze")), + # marks=not_implemented_mark, + ), + (pd.Series, ([1, 2],), operator.methodcaller("rename_axis", index="a")), + (pd.DataFrame, frame_data, operator.methodcaller("rename_axis", columns="a")), + # Unary ops + (pd.DataFrame, frame_data, operator.neg), + (pd.Series, [1], operator.neg), + (pd.DataFrame, frame_data, operator.pos), + (pd.Series, [1], operator.pos), + (pd.DataFrame, frame_data, operator.inv), + (pd.Series, [1], operator.inv), + (pd.DataFrame, frame_data, abs), + pytest.param((pd.Series, [1], abs), marks=not_implemented_mark), + pytest.param((pd.DataFrame, frame_data, round), marks=not_implemented_mark), + (pd.Series, [1], round), + (pd.DataFrame, frame_data, operator.methodcaller("take", [0, 0])), + (pd.DataFrame, frame_mi_data, operator.methodcaller("xs", "a")), + (pd.Series, (1, mi), operator.methodcaller("xs", "a")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("get", "A")), + marks=not_implemented_mark, + ), + ( + pd.DataFrame, + frame_data, + operator.methodcaller("reindex_like", pd.DataFrame({"A": [1, 2, 3]})), + ), + ( + pd.Series, + frame_data, + operator.methodcaller("reindex_like", pd.Series([0, 1, 2])), + ), + (pd.DataFrame, frame_data, operator.methodcaller("add_prefix", "_")), + (pd.DataFrame, frame_data, operator.methodcaller("add_suffix", "_")), + (pd.Series, (1, ["a", "b"]), operator.methodcaller("add_prefix", "_")), + (pd.Series, (1, ["a", "b"]), operator.methodcaller("add_suffix", "_")), + (pd.Series, ([3, 2],), operator.methodcaller("sort_values")), + (pd.Series, ([1] * 10,), operator.methodcaller("head")), + (pd.DataFrame, ({"A": [1] * 10},), operator.methodcaller("head")), + (pd.Series, ([1] * 10,), operator.methodcaller("tail")), + (pd.DataFrame, ({"A": [1] * 10},), operator.methodcaller("tail")), + (pd.Series, ([1, 2],), operator.methodcaller("sample", n=2, replace=True)), + (pd.DataFrame, (frame_data,), operator.methodcaller("sample", n=2, replace=True)), + (pd.Series, ([1, 2],), operator.methodcaller("astype", float)), + (pd.DataFrame, frame_data, operator.methodcaller("astype", float)), + (pd.Series, ([1, 2],), operator.methodcaller("copy")), + (pd.DataFrame, frame_data, operator.methodcaller("copy")), + (pd.Series, ([1, 2], None, object), operator.methodcaller("infer_objects")), + ( + pd.DataFrame, + ({"A": np.array([1, 2], dtype=object)},), + operator.methodcaller("infer_objects"), + ), + (pd.Series, ([1, 
2],), operator.methodcaller("convert_dtypes")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("convert_dtypes")), + marks=not_implemented_mark, + ), + (pd.Series, ([1, None, 3],), operator.methodcaller("interpolate")), + (pd.DataFrame, ({"A": [1, None, 3]},), operator.methodcaller("interpolate")), + (pd.Series, ([1, 2],), operator.methodcaller("clip", lower=1)), + (pd.DataFrame, frame_data, operator.methodcaller("clip", lower=1)), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("asfreq", "H"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("asfreq", "H"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("at_time", "12:00"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("at_time", "12:00"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("between_time", "12:00", "13:00"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("between_time", "12:00", "13:00"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("first", "3D"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("first", "3D"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("last", "3D"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("last", "3D"), + ), + (pd.Series, ([1, 2],), operator.methodcaller("rank")), + (pd.DataFrame, frame_data, operator.methodcaller("rank")), + (pd.Series, ([1, 2],), operator.methodcaller("where", np.array([True, False]))), + (pd.DataFrame, frame_data, operator.methodcaller("where", np.array([[True]]))), + (pd.Series, ([1, 2],), operator.methodcaller("mask", np.array([True, False]))), + (pd.DataFrame, frame_data, operator.methodcaller("mask", np.array([[True]]))), + (pd.Series, ([1, 2],), operator.methodcaller("slice_shift")), + (pd.DataFrame, frame_data, operator.methodcaller("slice_shift")), + (pd.Series, (1, pd.date_range("2000", periods=4)), operator.methodcaller("tshift")), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("tshift"), + ), + (pd.Series, ([1, 2],), operator.methodcaller("truncate", before=0)), + (pd.DataFrame, frame_data, operator.methodcaller("truncate", before=0)), + ( + pd.Series, + (1, pd.date_range("2000", periods=4, tz="UTC")), + operator.methodcaller("tz_convert", "CET"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4, tz="UTC")), + operator.methodcaller("tz_convert", "CET"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("tz_localize", "CET"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("tz_localize", "CET"), + ), + pytest.param( + (pd.Series, ([1, 2],), operator.methodcaller("describe")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("describe")), + marks=not_implemented_mark, + ), + (pd.Series, ([1, 2],), operator.methodcaller("pct_change")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("pct_change")), + marks=not_implemented_mark, + ), + (pd.Series, ([1],), operator.methodcaller("transform", lambda x: x - 
x.min())), + pytest.param( + ( + pd.DataFrame, + frame_mi_data, + operator.methodcaller("transform", lambda x: x - x.min()), + ), + marks=not_implemented_mark, + ), + (pd.Series, ([1],), operator.methodcaller("apply", lambda x: x)), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("apply", lambda x: x)), + marks=not_implemented_mark, + ), + # Cumulative reductions + (pd.Series, ([1],), operator.methodcaller("cumsum")), + (pd.DataFrame, frame_data, operator.methodcaller("cumsum")), + # Reductions + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("any")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("sum")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("std")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("mean")), + marks=not_implemented_mark, + ), +] + + +def idfn(x): + xpr = re.compile(r"'(.*)?'") + m = xpr.search(str(x)) + if m: + return m.group(1) + else: + return str(x) + + +@pytest.fixture(params=_all_methods, ids=lambda x: idfn(x[-1])) +def ndframe_method(request): + """ + An NDFrame method returning an NDFrame. + """ + return request.param + + +def test_finalize_called(ndframe_method): + cls, init_args, method = ndframe_method + ndframe = cls(*init_args) + + ndframe.attrs = {"a": 1} + result = method(ndframe) + + assert result.attrs == {"a": 1} + + +# ---------------------------------------------------------------------------- +# Binary operations + + +@pytest.mark.parametrize("annotate", ["left", "right", "both"]) +@pytest.mark.parametrize( + "args", + [ + (1, pd.Series([1])), + (1, pd.DataFrame({"A": [1]})), + (pd.Series([1]), 1), + (pd.DataFrame({"A": [1]}), 1), + (pd.Series([1]), pd.Series([1])), + (pd.DataFrame({"A": [1]}), pd.DataFrame({"A": [1]})), + (pd.Series([1]), pd.DataFrame({"A": [1]})), + (pd.DataFrame({"A": [1]}), pd.Series([1])), + ], +) +def test_binops(args, annotate, all_arithmetic_functions): + # This generates 326 tests... Is that needed? 
+ left, right = args + if annotate == "both" and isinstance(left, int) or isinstance(right, int): + return + + if isinstance(left, pd.DataFrame) or isinstance(right, pd.DataFrame): + pytest.xfail(reason="not implemented") + + if annotate in {"left", "both"} and not isinstance(left, int): + left.attrs = {"a": 1} + if annotate in {"right", "both"} and not isinstance(right, int): + right.attrs = {"a": 1} + + result = all_arithmetic_functions(left, right) + assert result.attrs == {"a": 1} + + +# ---------------------------------------------------------------------------- +# Accessors + + +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("capitalize"), + operator.methodcaller("casefold"), + pytest.param( + operator.methodcaller("cat", ["a"]), + marks=pytest.mark.xfail(reason="finalize not called."), + ), + operator.methodcaller("contains", "a"), + operator.methodcaller("count", "a"), + operator.methodcaller("encode", "utf-8"), + operator.methodcaller("endswith", "a"), + pytest.param( + operator.methodcaller("extract", r"(\w)(\d)"), + marks=pytest.mark.xfail(reason="finalize not called."), + ), + pytest.param( + operator.methodcaller("extractall", r"(\w)(\d)"), + marks=pytest.mark.xfail(reason="finalize not called."), + ), + operator.methodcaller("find", "a"), + operator.methodcaller("findall", "a"), + operator.methodcaller("get", 0), + operator.methodcaller("index", "a"), + operator.methodcaller("len"), + operator.methodcaller("ljust", 4), + operator.methodcaller("lower"), + operator.methodcaller("lstrip"), + operator.methodcaller("match", r"\w"), + operator.methodcaller("normalize", "NFC"), + operator.methodcaller("pad", 4), + operator.methodcaller("partition", "a"), + operator.methodcaller("repeat", 2), + operator.methodcaller("replace", "a", "b"), + operator.methodcaller("rfind", "a"), + operator.methodcaller("rindex", "a"), + operator.methodcaller("rjust", 4), + operator.methodcaller("rpartition", "a"), + operator.methodcaller("rstrip"), + operator.methodcaller("slice", 4), + operator.methodcaller("slice_replace", 1, repl="a"), + operator.methodcaller("startswith", "a"), + operator.methodcaller("strip"), + operator.methodcaller("swapcase"), + operator.methodcaller("translate", {"a": "b"}), + operator.methodcaller("upper"), + operator.methodcaller("wrap", 4), + operator.methodcaller("zfill", 4), + operator.methodcaller("isalnum"), + operator.methodcaller("isalpha"), + operator.methodcaller("isdigit"), + operator.methodcaller("isspace"), + operator.methodcaller("islower"), + operator.methodcaller("isupper"), + operator.methodcaller("istitle"), + operator.methodcaller("isnumeric"), + operator.methodcaller("isdecimal"), + operator.methodcaller("get_dummies"), + ], + ids=idfn, +) +@not_implemented_mark +def test_string_method(method): + s = pd.Series(["a1"]) + s.attrs = {"a": 1} + result = method(s.str) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("to_period"), + operator.methodcaller("tz_localize", "CET"), + operator.methodcaller("normalize"), + operator.methodcaller("strftime", "%Y"), + operator.methodcaller("round", "H"), + operator.methodcaller("floor", "H"), + operator.methodcaller("ceil", "H"), + operator.methodcaller("month_name"), + operator.methodcaller("day_name"), + ], + ids=idfn, +) +@not_implemented_mark +def test_datetime_method(method): + s = pd.Series(pd.date_range("2000", periods=4)) + s.attrs = {"a": 1} + result = method(s.dt) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "attr", + [ + 
"date", + "time", + "timetz", + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + "week", + "weekofyear", + "dayofweek", + "dayofyear", + "quarter", + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + "is_leap_year", + "daysinmonth", + "days_in_month", + ], +) +@not_implemented_mark +def test_datetime_property(attr): + s = pd.Series(pd.date_range("2000", periods=4)) + s.attrs = {"a": 1} + result = getattr(s.dt, attr) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"] +) +@not_implemented_mark +def test_timedelta_property(attr): + s = pd.Series(pd.timedelta_range("2000", periods=4)) + s.attrs = {"a": 1} + result = getattr(s.dt, attr) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "method", [operator.methodcaller("total_seconds")], +) +@not_implemented_mark +def test_timedelta_methods(method): + s = pd.Series(pd.timedelta_range("2000", periods=4)) + s.attrs = {"a": 1} + result = method(s.dt) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("add_categories", ["c"]), + operator.methodcaller("as_ordered"), + operator.methodcaller("as_unordered"), + lambda x: getattr(x, "codes"), + operator.methodcaller("remove_categories", "a"), + operator.methodcaller("remove_unused_categories"), + operator.methodcaller("rename_categories", {"a": "A", "b": "B"}), + operator.methodcaller("reorder_categories", ["b", "a"]), + operator.methodcaller("set_categories", ["A", "B"]), + ], +) +@not_implemented_mark +def test_categorical_accessor(method): + s = pd.Series(["a", "b"], dtype="category") + s.attrs = {"a": 1} + result = method(s.cat) + assert result.attrs == {"a": 1} + + +# ---------------------------------------------------------------------------- +# Groupby + + +@pytest.mark.parametrize( + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] +) +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("sum"), + lambda x: x.agg("sum"), + lambda x: x.agg(["sum", "count"]), + lambda x: x.transform(lambda y: y), + lambda x: x.apply(lambda y: y), + ], +) +@not_implemented_mark +def test_groupby(obj, method): + obj.attrs = {"a": 1} + result = method(obj.groupby([0, 0])) + assert result.attrs == {"a": 1} diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index dca65152e82db..31501f20db453 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -1,25 +1,15 @@ from copy import deepcopy -from distutils.version import LooseVersion from operator import methodcaller import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range import pandas._testing as tm from .test_generic import Generic -try: - import xarray - - _XARRAY_INSTALLED = True -except ImportError: - _XARRAY_INSTALLED = False - class TestDataFrame(Generic): _typ = DataFrame @@ -72,9 +62,10 @@ def test_nonzero_single_element(self): assert not df.bool() df = DataFrame([[False, False]]) - with pytest.raises(ValueError): + msg = "The truth value of a DataFrame is ambiguous" + with pytest.raises(ValueError, match=msg): df.bool() - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): bool(df) def test_get_numeric_data_preserve_dtype(self): @@ -189,30 +180,31 @@ class 
TestDataFrame2: def test_validate_bool_args(self, value): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - with pytest.raises(ValueError): + msg = 'For argument "inplace" expected type bool, received type' + with pytest.raises(ValueError, match=msg): super(DataFrame, df).rename_axis( mapper={"a": "x", "b": "y"}, axis=1, inplace=value ) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): super(DataFrame, df).drop("a", axis=1, inplace=value) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): super(DataFrame, df)._consolidate(inplace=value) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): super(DataFrame, df).fillna(value=0, inplace=value) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): super(DataFrame, df).replace(to_replace=1, value=7, inplace=value) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): super(DataFrame, df).interpolate(inplace=value) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): super(DataFrame, df)._where(cond=df.a > 2, inplace=value) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): super(DataFrame, df).mask(cond=df.a > 2, inplace=value) def test_unexpected_keyword(self): @@ -222,102 +214,15 @@ def test_unexpected_keyword(self): ts = df["joe"].copy() ts[2] = np.nan - with pytest.raises(TypeError, match="unexpected keyword"): + msg = "unexpected keyword" + with pytest.raises(TypeError, match=msg): df.drop("joe", axis=1, in_place=True) - with pytest.raises(TypeError, match="unexpected keyword"): + with pytest.raises(TypeError, match=msg): df.reindex([1, 0], inplace=True) - with pytest.raises(TypeError, match="unexpected keyword"): + with pytest.raises(TypeError, match=msg): ca.fillna(0, inplace=True) - with pytest.raises(TypeError, match="unexpected keyword"): + with pytest.raises(TypeError, match=msg): ts.fillna(0, in_place=True) - - -class TestToXArray: - @pytest.mark.skipif( - not _XARRAY_INSTALLED - or _XARRAY_INSTALLED - and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), - reason="xarray >= 0.10.0 required", - ) - @pytest.mark.parametrize("index", tm.all_index_generator(3)) - def test_to_xarray_index_types(self, index): - from xarray import Dataset - - df = DataFrame( - { - "a": list("abc"), - "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": [True, False, True], - "f": pd.Categorical(list("abc")), - "g": pd.date_range("20130101", periods=3), - "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), - } - ) - - df.index = index - df.index.name = "foo" - df.columns.name = "bar" - result = df.to_xarray() - assert result.dims["foo"] == 3 - assert len(result.coords) == 1 - assert len(result.data_vars) == 8 - tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, Dataset) - - # idempotency - # categoricals are not preserved - # datetimes w/tz are preserved - # column names are lost - expected = df.copy() - expected["f"] = expected["f"].astype(object) - expected.columns.name = None - tm.assert_frame_equal( - result.to_dataframe(), - expected, - check_index_type=False, - check_categorical=False, - ) - - @td.skip_if_no("xarray", min_version="0.7.0") - def test_to_xarray(self): - from xarray import Dataset - - df = DataFrame( - { - "a": list("abc"), - "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": 
[True, False, True], - "f": pd.Categorical(list("abc")), - "g": pd.date_range("20130101", periods=3), - "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), - } - ) - - df.index.name = "foo" - result = df[0:0].to_xarray() - assert result.dims["foo"] == 0 - assert isinstance(result, Dataset) - - # available in 0.7.1 - # MultiIndex - df.index = pd.MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) - result = df.to_xarray() - assert result.dims["one"] == 1 - assert result.dims["two"] == 3 - assert len(result.coords) == 2 - assert len(result.data_vars) == 8 - tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) - assert isinstance(result, Dataset) - - result = result.to_dataframe() - expected = df.copy() - expected["f"] = expected["f"].astype(object) - expected.columns.name = None - tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 1b6cb8447c76d..2c8261a6dcc5a 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -3,11 +3,14 @@ import numpy as np import pytest +from pandas.compat.numpy import _np_version_under1p17 + from pandas.core.dtypes.common import is_scalar import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range import pandas._testing as tm +import pandas.core.common as com # ---------------------------------------------------------------------- # Generic types test cases @@ -159,27 +162,14 @@ def test_downcast(self): o = self._construct(shape=4, value=9, dtype=np.int64) result = o.copy() - result._data = o._data.downcast(dtypes="infer") + result._mgr = o._mgr.downcast() self._compare(result, o) - o = self._construct(shape=4, value=9.0) - expected = o.astype(np.int64) - result = o.copy() - result._data = o._data.downcast(dtypes="infer") - self._compare(result, expected) - o = self._construct(shape=4, value=9.5) result = o.copy() - result._data = o._data.downcast(dtypes="infer") + result._mgr = o._mgr.downcast() self._compare(result, o) - # are close - o = self._construct(shape=4, value=9.000000000005) - result = o.copy() - result._data = o._data.downcast(dtypes="infer") - expected = o.astype(np.int64) - self._compare(result, expected) - def test_constructor_compound_dtypes(self): # see gh-5191 # Compound dtypes should raise NotImplementedError. 
@@ -259,14 +249,13 @@ def test_metadata_propagation(self): self.check_metadata(v1 & v2) self.check_metadata(v1 | v2) - @pytest.mark.parametrize("index", tm.all_index_generator(10)) - def test_head_tail(self, index): + def test_head_tail(self, indices): # GH5370 - o = self._construct(shape=10) + o = self._construct(shape=len(indices)) axis = o._get_axis_name(0) - setattr(o, axis, index) + setattr(o, axis, indices) o.head() @@ -282,8 +271,8 @@ def test_head_tail(self, index): self._compare(o.tail(len(o) + 1), o) # neg index - self._compare(o.head(-3), o.head(7)) - self._compare(o.tail(-3), o.tail(7)) + self._compare(o.head(-3), o.head(len(indices) - 3)) + self._compare(o.tail(-3), o.tail(len(indices) - 3)) def test_sample(self): # Fixes issue: 2419 @@ -654,6 +643,29 @@ def test_sample(sel): with pytest.raises(ValueError): df.sample(1, weights=s4) + @pytest.mark.parametrize( + "func_str,arg", + [ + ("np.array", [2, 3, 1, 0]), + pytest.param( + "np.random.MT19937", + 3, + marks=pytest.mark.skipif(_np_version_under1p17, reason="NumPy<1.17"), + ), + pytest.param( + "np.random.PCG64", + 11, + marks=pytest.mark.skipif(_np_version_under1p17, reason="NumPy<1.17"), + ), + ], + ) + def test_sample_random_state(self, func_str, arg): + # GH32503 + df = pd.DataFrame({"col1": range(10, 20), "col2": range(20, 30)}) + result = df.sample(n=3, random_state=eval(func_str)(arg)) + expected = df.sample(n=3, random_state=com.random_state(eval(func_str)(arg))) + tm.assert_frame_equal(result, expected) + def test_squeeze(self): # noop for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: @@ -679,10 +691,10 @@ def test_squeeze(self): tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0]) tm.assert_series_equal(df.squeeze(axis="columns"), df.iloc[:, 0]) assert df.squeeze() == df.iloc[0, 0] - msg = "No axis named 2 for object type " + msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): df.squeeze(axis=2) - msg = "No axis named x for object type " + msg = "No axis named x for object type DataFrame" with pytest.raises(ValueError, match=msg): df.squeeze(axis="x") diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index f119eb422a276..07c02330d85ce 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -1,24 +1,14 @@ -from distutils.version import LooseVersion from operator import methodcaller import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import MultiIndex, Series, date_range import pandas._testing as tm from .test_generic import Generic -try: - import xarray - - _XARRAY_INSTALLED = True -except ImportError: - _XARRAY_INSTALLED = False - class TestSeries(Generic): _typ = Series @@ -31,15 +21,6 @@ def test_rename_mi(self): ) s.rename(str.lower) - @pytest.mark.parametrize("func", ["rename_axis", "_set_axis_name"]) - def test_set_axis_name(self, func): - s = Series([1, 2, 3], index=["a", "b", "c"]) - name = "foo" - - result = methodcaller(func, name)(s) - assert s.index.name is None - assert result.index.name == name - @pytest.mark.parametrize("func", ["rename_axis", "_set_axis_name"]) def test_set_axis_name_mi(self, func): s = Series( @@ -57,7 +38,8 @@ def test_set_axis_name_mi(self, func): def test_set_axis_name_raises(self): s = pd.Series([1]) - with pytest.raises(ValueError): + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): s._set_axis_name(name="a", axis=1) def 
test_get_numeric_data_preserve_dtype(self): @@ -176,6 +158,9 @@ def finalize(self, other, method=None, **kwargs): Series._metadata = _metadata Series.__finalize__ = _finalize # FIXME: use monkeypatch + +class TestSeries2: + # Separating off because it doesnt rely on parent class @pytest.mark.parametrize( "s", [ @@ -194,72 +179,3 @@ def test_datetime_shift_always_copy(self, move_by_freq): # GH22397 s = pd.Series(range(5), index=pd.date_range("2017", periods=5)) assert s.shift(freq=move_by_freq) is not s - - -class TestSeries2: - # moved from Generic - def test_get_default(self): - - # GH#7725 - d0 = ["a", "b", "c", "d"] - d1 = np.arange(4, dtype="int64") - others = ["e", 10] - - for data, index in ((d0, d1), (d1, d0)): - s = Series(data, index=index) - for i, d in zip(index, data): - assert s.get(i) == d - assert s.get(i, d) == d - assert s.get(i, "z") == d - for other in others: - assert s.get(other, "z") == "z" - assert s.get(other, other) == other - - -class TestToXArray: - @pytest.mark.skipif( - not _XARRAY_INSTALLED - or _XARRAY_INSTALLED - and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), - reason="xarray >= 0.10.0 required", - ) - @pytest.mark.parametrize("index", tm.all_index_generator(6)) - def test_to_xarray_index_types(self, index): - from xarray import DataArray - - s = Series(range(6), index=index) - s.index.name = "foo" - result = s.to_xarray() - repr(result) - assert len(result) == 6 - assert len(result.coords) == 1 - tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, DataArray) - - # idempotency - tm.assert_series_equal( - result.to_series(), s, check_index_type=False, check_categorical=True - ) - - @td.skip_if_no("xarray", min_version="0.7.0") - def test_to_xarray(self): - from xarray import DataArray - - s = Series([], dtype=object) - s.index.name = "foo" - result = s.to_xarray() - assert len(result) == 0 - assert len(result.coords) == 1 - tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) - assert isinstance(result, DataArray) - - s = Series(range(6)) - s.index.name = "foo" - s.index = pd.MultiIndex.from_product( - [["a", "b"], range(3)], names=["one", "two"] - ) - result = s.to_xarray() - assert len(result) == 2 - tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) - assert isinstance(result, DataArray) - tm.assert_series_equal(result.to_series(), s) diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py new file mode 100644 index 0000000000000..b6abdf09a7f62 --- /dev/null +++ b/pandas/tests/generic/test_to_xarray.py @@ -0,0 +1,135 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFrameToXArray: + @td.skip_if_no("xarray", "0.10.0") + def test_to_xarray_index_types(self, indices): + if isinstance(indices, pd.MultiIndex): + pytest.skip("MultiIndex is tested separately") + if len(indices) == 0: + pytest.skip("Test doesn't make sense for empty index") + + from xarray import Dataset + + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index = indices[:3] + df.index.name = "foo" + df.columns.name = "bar" + result = df.to_xarray() + assert 
result.dims["foo"] == 3 + assert len(result.coords) == 1 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, Dataset) + + # idempotency + # datetimes w/tz are preserved + # column names are lost + expected = df.copy() + expected["f"] = expected["f"].astype(object) + expected.columns.name = None + tm.assert_frame_equal( + result.to_dataframe(), expected, + ) + + @td.skip_if_no("xarray", min_version="0.7.0") + def test_to_xarray(self): + from xarray import Dataset + + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index.name = "foo" + result = df[0:0].to_xarray() + assert result.dims["foo"] == 0 + assert isinstance(result, Dataset) + + # available in 0.7.1 + # MultiIndex + df.index = pd.MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) + result = df.to_xarray() + assert result.dims["one"] == 1 + assert result.dims["two"] == 3 + assert len(result.coords) == 2 + assert len(result.data_vars) == 8 + tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) + assert isinstance(result, Dataset) + + result = result.to_dataframe() + expected = df.copy() + expected["f"] = expected["f"].astype(object) + expected.columns.name = None + tm.assert_frame_equal(result, expected, check_index_type=False) + + +class TestSeriesToXArray: + @td.skip_if_no("xarray", "0.10.0") + def test_to_xarray_index_types(self, indices): + if isinstance(indices, pd.MultiIndex): + pytest.skip("MultiIndex is tested separately") + + from xarray import DataArray + + s = Series(range(len(indices)), index=indices) + s.index.name = "foo" + result = s.to_xarray() + repr(result) + assert len(result) == len(indices) + assert len(result.coords) == 1 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, DataArray) + + # idempotency + tm.assert_series_equal(result.to_series(), s, check_index_type=False) + + @td.skip_if_no("xarray", min_version="0.7.0") + def test_to_xarray(self): + from xarray import DataArray + + s = Series([], dtype=object) + s.index.name = "foo" + result = s.to_xarray() + assert len(result) == 0 + assert len(result.coords) == 1 + tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) + assert isinstance(result, DataArray) + + s = Series(range(6)) + s.index.name = "foo" + s.index = pd.MultiIndex.from_product( + [["a", "b"], range(3)], names=["one", "two"] + ) + result = s.to_xarray() + assert len(result) == 2 + tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) + assert isinstance(result, DataArray) + tm.assert_series_equal(result.to_series(), s) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 48f8de7e51ae4..e860ea1a3d052 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_integer_dtype + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat import pandas._testing as tm @@ -340,6 +342,30 @@ def test_groupby_agg_coercing_bools(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "op", + [ + lambda x: x.sum(), + lambda 
x: x.cumsum(), + lambda x: x.transform("sum"), + lambda x: x.transform("cumsum"), + lambda x: x.agg("sum"), + lambda x: x.agg("cumsum"), + ], +) +def test_bool_agg_dtype(op): + # GH 7001 + # Bool sum aggregations result in int + df = pd.DataFrame({"a": [1, 1], "b": [False, True]}) + s = df.set_index("a")["b"] + + result = op(df.groupby("a"))["b"].dtype + assert is_integer_dtype(result) + + result = op(s.groupby("a")).dtype + assert is_integer_dtype(result) + + def test_order_aggregate_multiple_funcs(): # GH 25692 df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) @@ -691,6 +717,19 @@ def test_agg_relabel_multiindex_duplicates(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)] +) +def test_multiindex_custom_func(func): + # GH 31777 + data = [[1, 4, 2], [5, 7, 1]] + df = pd.DataFrame(data, columns=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]])) + result = df.groupby(np.array([0, 1])).agg(func) + expected_dict = {(1, 3): {0: 1, 1: 5}, (1, 4): {0: 4, 1: 7}, (2, 3): {0: 2, 1: 1}} + expected = pd.DataFrame(expected_dict) + tm.assert_frame_equal(result, expected) + + def myfunc(s): return np.percentile(s, q=0.90) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 52ee3e652501c..264cf40dc6984 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -209,7 +209,7 @@ def test_aggregate_api_consistency(): expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) - msg = r"nested renamer is not supported" + msg = r"Column\(s\) \['r', 'r2'\] do not exist" with pytest.raises(SpecificationError, match=msg): grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) @@ -224,9 +224,11 @@ def test_agg_dict_renaming_deprecation(): {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} ) + msg = r"Column\(s\) \['ma'\] do not exist" with pytest.raises(SpecificationError, match=msg): df.groupby("A")[["B", "C"]].agg({"ma": "max"}) + msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): df.groupby("A").B.agg({"foo": "count"}) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 18ad5d90b3f60..9fcbabb07857e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -866,3 +866,38 @@ def fct(group): [[1.0, 2.0], [3.0], [np.nan]], index=pd.Index(["a", "b", "none"], name="A") ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "function", [lambda gr: gr.index, lambda gr: gr.index + 1 - 1], +) +def test_apply_function_index_return(function): + # GH: 22541 + df = pd.DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) + result = df.groupby("id").apply(function) + expected = pd.Series( + [pd.Index([0, 4, 7, 9]), pd.Index([1, 2, 3, 5]), pd.Index([6, 8])], + index=pd.Index([1, 2, 3], name="id"), + ) + tm.assert_series_equal(result, expected) + + +def test_apply_function_with_indexing(): + # GH: 33058 + df = pd.DataFrame( + {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} + ) + + def fn(x): + x.col2[x.index[-1]] = 0 + return x.col2 + + result = df.groupby(["col1"], as_index=False).apply(fn) + expected = pd.Series( + [1, 2, 0, 4, 5, 0], + index=pd.MultiIndex.from_tuples( + [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)] + ), + name="col2", + ) + 
tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py new file mode 100644 index 0000000000000..529f76bf692ce --- /dev/null +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -0,0 +1,70 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm + + +def test_mutate_groups(): + + # GH3380 + + df = pd.DataFrame( + { + "cat1": ["a"] * 8 + ["b"] * 6, + "cat2": ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2 + + ["f"] * 2 + + ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2, + "cat3": [f"g{x}" for x in range(1, 15)], + "val": np.random.randint(100, size=14), + } + ) + + def f_copy(x): + x = x.copy() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() + + def f_no_copy(x): + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() + + grpby_copy = df.groupby("cat1").apply(f_copy) + grpby_no_copy = df.groupby("cat1").apply(f_no_copy) + tm.assert_series_equal(grpby_copy, grpby_no_copy) + + +def test_no_mutate_but_looks_like(): + + # GH 8467 + # first show's mutation indicator + # second does not, but should yield the same results + df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) + + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) + tm.assert_series_equal(result1, result2) + + +def test_apply_function_with_indexing(): + # GH: 33058 + df = pd.DataFrame( + {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} + ) + + def fn(x): + x.col2[x.index[-1]] = 0 + return x.col2 + + result = df.groupby(["col1"], as_index=False).apply(fn) + expected = pd.Series( + [1, 2, 0, 4, 5, 0], + index=pd.MultiIndex.from_tuples( + [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)] + ), + name="col2", + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index ff74d374e5e3f..e999b88fccb08 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -5,6 +5,7 @@ from pandas.core.dtypes.common import ensure_int64 +import pandas as pd from pandas import Index, Series, isna import pandas._testing as tm @@ -51,6 +52,30 @@ def test_series_bin_grouper(): tm.assert_almost_equal(counts, exp_counts) +def assert_block_lengths(x): + assert len(x) == len(x._mgr.blocks[0].mgr_locs) + return 0 + + +def cumsum_max(x): + x.cumsum().max() + return 0 + + +@pytest.mark.parametrize("func", [cumsum_max, assert_block_lengths]) +def test_mgr_locs_updated(func): + # https://github.com/pandas-dev/pandas/issues/31802 + # Some operations may require creating new blocks, which requires + # valid mgr_locs + df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]}) + result = df.groupby(["A", "B"]).agg(func) + expected = pd.DataFrame( + {"C": [0, 0]}, + index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "binner,closed,expected", [ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 9b07269811d8e..da8327f64e26f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1226,10 +1226,10 @@ def test_groupby_categorical_axis_1(code): tm.assert_frame_equal(result, expected) -def test_groupby_cat_preserves_structure(observed, 
ordered_fixture): +def test_groupby_cat_preserves_structure(observed, ordered): # GH 28787 df = DataFrame( - {"Name": Categorical(["Bob", "Greg"], ordered=ordered_fixture), "Item": [1, 2]}, + {"Name": Categorical(["Bob", "Greg"], ordered=ordered), "Item": [1, 2]}, columns=["Name", "Item"], ) expected = df.copy() @@ -1262,6 +1262,9 @@ def test_series_groupby_on_2_categoricals_unobserved( if reduction_func == "ngroup": pytest.skip("ngroup is not truly a reduction") + if reduction_func == "corrwith": # GH 32293 + pytest.xfail("TODO: implemented SeriesGroupBy.corrwith") + df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), @@ -1377,3 +1380,15 @@ def test_groupby_agg_non_numeric(): result = df.groupby([1, 2, 1]).nunique() tm.assert_frame_equal(result, expected) + + +def test_read_only_category_no_sort(): + # GH33410 + cats = np.array([1, 2]) + cats.flags.writeable = False + df = DataFrame( + {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))} + ) + expected = DataFrame(data={"a": [2, 6]}, index=CategoricalIndex([1, 2], name="b")) + result = df.groupby("b", sort=False).mean() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index b4239d7d34a90..56a18757da6e7 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp +from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, Timestamp import pandas._testing as tm @@ -220,3 +220,12 @@ def test_count_with_only_nans_in_first_group(self): mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"]) expected = Series([], index=mi, dtype=np.int64, name="C") tm.assert_series_equal(result, expected, check_index_type=False) + + def test_count_groupby_column_with_nan_in_groupby_column(self): + # https://github.com/pandas-dev/pandas/issues/32841 + df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]}) + res = df.groupby(["B"]).count() + expected = DataFrame( + index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]} + ) + tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index c402ca194648f..346de55f551df 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,7 +1,6 @@ import builtins import datetime as dt from io import StringIO -from itertools import product from string import ascii_lowercase import numpy as np @@ -662,7 +661,7 @@ def test_nlargest_mi_grouper(): ] expected = Series(exp_values, index=exp_idx) - tm.assert_series_equal(result, expected, check_exact=False, check_less_precise=True) + tm.assert_series_equal(result, expected, check_exact=False) def test_nsmallest(): @@ -1296,36 +1295,32 @@ def __eq__(self, other): # -------------------------------- -def test_size(df): - grouped = df.groupby(["A", "B"]) +@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +def test_size(df, by): + grouped = df.groupby(by=by) result = grouped.size() for key, group in grouped: assert result[key] == len(group) - grouped = df.groupby("A") - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - grouped = df.groupby("B") - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) 
+@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +@pytest.mark.parametrize("sort", [True, False]) +def test_size_sort(df, sort, by): + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("ABC")) + left = df.groupby(by=by, sort=sort).size() + right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0]) + tm.assert_series_equal(left, right, check_names=False) - df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("abc")) - for sort, key in product((False, True), ("a", "b", ["a", "b"])): - left = df.groupby(key, sort=sort).size() - right = df.groupby(key, sort=sort)["c"].apply(lambda a: a.shape[0]) - tm.assert_series_equal(left, right, check_names=False) - # GH11699 +def test_size_series_dataframe(): + # https://github.com/pandas-dev/pandas/issues/11699 df = DataFrame(columns=["A", "B"]) out = Series(dtype="int64", index=Index([], name="A")) tm.assert_series_equal(df.groupby("A").size(), out) def test_size_groupby_all_null(): - # GH23050 + # https://github.com/pandas-dev/pandas/issues/23050 # Assert no 'Value Error : Length of passed values is 2, index implies 0' df = DataFrame({"A": [None, None]}) # all-null groups result = df.groupby("A").size() @@ -1335,6 +1330,8 @@ def test_size_groupby_all_null(): # quantile # -------------------------------- + + @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] ) @@ -1522,6 +1519,30 @@ def test_quantile_missing_group_values_correct_results(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "values", + [ + pd.array([1, 0, None] * 2, dtype="Int64"), + pd.array([True, False, None] * 2, dtype="boolean"), + ], +) +@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) +def test_groupby_quantile_nullable_array(values, q): + # https://github.com/pandas-dev/pandas/issues/33136 + df = pd.DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) + result = df.groupby("a")["b"].quantile(q) + + if isinstance(q, list): + idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) + true_quantiles = [0.0, 0.5, 1.0] + else: + idx = pd.Index(["x", "y"], name="a") + true_quantiles = [0.5] + + expected = pd.Series(true_quantiles * 2, index=idx, name="b") + tm.assert_series_equal(result, expected) + + # pipe # -------------------------------- @@ -1608,3 +1629,34 @@ def test_groupby_mean_no_overflow(): } ) assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 + + +@pytest.mark.parametrize( + "values", + [ + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], + }, + {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, + ], +) +@pytest.mark.parametrize("function", ["mean", "median", "var"]) +def test_apply_to_nullable_integer_returns_float(values, function): + # https://github.com/pandas-dev/pandas/issues/32219 + output = 0.5 if function == "var" else 1.5 + arr = np.array([output] * 3, dtype=float) + idx = pd.Index([1, 2, 3], dtype=object, name="a") + expected = pd.DataFrame({"b": arr}, index=idx) + + groups = pd.DataFrame(values, dtype="Int64").groupby("a") + + result = getattr(groups, function)() + tm.assert_frame_equal(result, expected) + + result = groups.agg(function) + tm.assert_frame_equal(result, expected) + + result = groups.agg([function]) + expected.columns = MultiIndex.from_tuples([("b", function)]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 
5662d41e19885..c88d16e34eab8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -921,51 +921,6 @@ def test_groupby_complex(): tm.assert_series_equal(result, expected) -def test_mutate_groups(): - - # GH3380 - - df = DataFrame( - { - "cat1": ["a"] * 8 + ["b"] * 6, - "cat2": ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2 - + ["f"] * 2 - + ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2, - "cat3": [f"g{x}" for x in range(1, 15)], - "val": np.random.randint(100, size=14), - } - ) - - def f_copy(x): - x = x.copy() - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - def f_no_copy(x): - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - grpby_copy = df.groupby("cat1").apply(f_copy) - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) - tm.assert_series_equal(grpby_copy, grpby_no_copy) - - -def test_no_mutate_but_looks_like(): - - # GH 8467 - # first show's mutation indicator - # second does not, but should yield the same results - df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) - tm.assert_series_equal(result1, result2) - - def test_groupby_series_indexed_differently(): s1 = Series( [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7], @@ -1765,7 +1720,7 @@ def test_tuple_as_grouping(): } ) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=r"('a', 'b')"): df[["a", "b", "c"]].groupby(("a", "b")) result = df.groupby(("a", "b"))["c"].sum() diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index b1476f1059d84..e1bc058508bee 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -384,6 +384,32 @@ def test_first_last_tz_multi_column(method, ts, alpha): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "values", + [ + pd.array([True, False], dtype="boolean"), + pd.array([1, 2], dtype="Int64"), + pd.to_datetime(["2020-01-01", "2020-02-01"]), + pd.to_timedelta([1, 2], unit="D"), + ], +) +@pytest.mark.parametrize("function", ["first", "last", "min", "max"]) +def test_first_last_extension_array_keeps_dtype(values, function): + # https://github.com/pandas-dev/pandas/issues/33071 + # https://github.com/pandas-dev/pandas/issues/32194 + df = DataFrame({"a": [1, 2], "b": values}) + grouped = df.groupby("a") + idx = Index([1, 2], name="a") + expected_series = Series(values, name="b", index=idx) + expected_frame = DataFrame({"b": values}, index=idx) + + result_series = getattr(grouped["b"], function)() + tm.assert_series_equal(result_series, expected_series) + + result_frame = grouped.agg({"b": function}) + tm.assert_frame_equal(result_frame, expected_frame) + + def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 6b8bd9e805a0c..7cac13efb71f3 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -214,7 +214,7 @@ def test_timegrouper_with_reg_groups(self): result = df.groupby([pd.Grouper(freq="1M", level=0), "Buyer"]).sum() tm.assert_frame_equal(result, expected) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="The level foo is not valid"): df.groupby([pd.Grouper(freq="1M", level="foo"), "Buyer"]).sum() # multi names 
@@ -235,7 +235,8 @@ def test_timegrouper_with_reg_groups(self): tm.assert_frame_equal(result, expected) # error as we have both a level and a name! - with pytest.raises(ValueError): + msg = "The Grouper cannot specify both a key and a level!" + with pytest.raises(ValueError, match=msg): df.groupby( [pd.Grouper(freq="1M", key="Date", level="Date"), "Buyer"] ).sum() diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 740103eec185a..2295eb2297fa6 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -327,9 +327,9 @@ def test_transform_transformation_func(transformation_func): } ) - if transformation_func in ["pad", "backfill", "tshift", "corrwith", "cumcount"]: + if transformation_func in ["pad", "backfill", "tshift", "cumcount"]: # These transformation functions are not yet covered in this test - pytest.xfail("See GH 31269 and GH 31270") + pytest.xfail("See GH 31269") elif _is_numpy_dev and transformation_func in ["cummin"]: pytest.xfail("https://github.com/pandas-dev/pandas/issues/31992") elif transformation_func == "fillna": @@ -1093,8 +1093,10 @@ def test_transform_agg_by_name(reduction_func, obj): pytest.xfail("TODO: g.transform('ngroup') doesn't work") if func == "size": # GH#27469 pytest.xfail("TODO: g.transform('size') doesn't work") + if func == "corrwith" and isinstance(obj, Series): # GH#32293 + pytest.xfail("TODO: implement SeriesGroupBy.corrwith") - args = {"nth": [0], "quantile": [0.5]}.get(func, []) + args = {"nth": [0], "quantile": [0.5], "corrwith": [obj]}.get(func, []) result = g.transform(func, *args) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 9e6a8f34c135d..02b32c46e7d6f 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from pandas import Index, MultiIndex @@ -7,14 +8,15 @@ class TestIndexConstructor: # Tests for the Index constructor, specifically for cases that do # not return a subclass - def test_constructor_corner(self): + @pytest.mark.parametrize("value", [1, np.int64(1)]) + def test_constructor_corner(self, value): # corner case msg = ( r"Index\(\.\.\.\) must be called with a collection of some " - "kind, 0 was passed" + f"kind, {value} was passed" ) with pytest.raises(TypeError, match=msg): - Index(0) + Index(value) @pytest.mark.parametrize("index_vals", [[("A", 1), "B"], ["B", ("A", 1)]]) def test_construction_list_mixed_tuples(self, index_vals): diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index ec3ef8050967c..77b5e2780464d 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -91,7 +91,6 @@ def test_union_sort_other_incomparable_true(self): with pytest.raises(TypeError, match=".*"): idx.union(idx[:1], sort=True) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_base(self, sort): # (same results for py2 and py3 but sortedness not tested elsewhere) index = Index([0, "a", 1, "b", 2, "c"]) @@ -103,7 +102,6 @@ def test_intersection_base(self, sort): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [np.array, Series, list]) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_different_type_base(self, klass, sort): # GH 10149 index = Index([0, "a", 1, "b", 2, "c"]) 
@@ -123,7 +121,6 @@ def test_intersection_equal_sort(self): tm.assert_index_equal(idx.intersection(idx, sort=False), idx) tm.assert_index_equal(idx.intersection(idx, sort=None), idx) - @pytest.mark.parametrize("sort", [None, False]) def test_difference_base(self, sort): # (same results for py2 and py3 but sortedness not tested elsewhere) index = Index([0, "a", 1, "b", 2, "c"]) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index c18cd1f252c83..83fe21fd20bfe 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas._config.config as cf - from pandas._libs import index as libindex from pandas.core.dtypes.dtypes import CategoricalDtype @@ -100,65 +98,6 @@ def test_method_delegation(self): with pytest.raises(ValueError, match=msg): ci.set_categories(list("cab"), inplace=True) - def test_contains(self): - - ci = self.create_index(categories=list("cabdef")) - - assert "a" in ci - assert "z" not in ci - assert "e" not in ci - assert np.nan not in ci - - # assert codes NOT in index - assert 0 not in ci - assert 1 not in ci - - ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) - assert np.nan in ci - - @pytest.mark.parametrize( - "item, expected", - [ - (pd.Interval(0, 1), True), - (1.5, True), - (pd.Interval(0.5, 1.5), False), - ("a", False), - (pd.Timestamp(1), False), - (pd.Timedelta(1), False), - ], - ids=str, - ) - def test_contains_interval(self, item, expected): - # GH 23705 - ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) - result = item in ci - assert result is expected - - def test_contains_list(self): - # GH#21729 - idx = pd.CategoricalIndex([1, 2, 3]) - - assert "a" not in idx - - with pytest.raises(TypeError, match="unhashable type"): - ["a"] in idx - - with pytest.raises(TypeError, match="unhashable type"): - ["a", "b"] in idx - - @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) - def test_where(self, klass): - i = self.create_index() - cond = [True] * len(i) - expected = i - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - - cond = [False] + [True] * (len(i) - 1) - expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_append(self): ci = self.create_index() @@ -253,7 +192,7 @@ def test_delete(self): expected = CategoricalIndex(list("aabbc"), categories=categories) tm.assert_index_equal(result, expected, exact=True) - with pytest.raises((IndexError, ValueError)): + with tm.external_error_raised((IndexError, ValueError)): # Either depending on NumPy version ci.delete(10) @@ -353,16 +292,81 @@ def test_is_monotonic(self, data, non_lexsorted_data): assert c.is_monotonic_decreasing is False def test_has_duplicates(self): - idx = CategoricalIndex([0, 0, 0], name="foo") assert idx.is_unique is False assert idx.has_duplicates is True - def test_drop_duplicates(self): + idx = CategoricalIndex([0, 1], categories=[2, 3], name="foo") + assert idx.is_unique is False + assert idx.has_duplicates is True + + idx = CategoricalIndex([0, 1, 2, 3], categories=[1, 2, 3], name="foo") + assert idx.is_unique is True + assert idx.has_duplicates is False - idx = CategoricalIndex([0, 0, 0], name="foo") - expected = CategoricalIndex([0], name="foo") - tm.assert_index_equal(idx.drop_duplicates(), expected) + 
@pytest.mark.parametrize( + "data, categories, expected", + [ + ( + [1, 1, 1], + [1, 2, 3], + { + "first": np.array([False, True, True]), + "last": np.array([True, True, False]), + False: np.array([True, True, True]), + }, + ), + ( + [1, 1, 1], + list("abc"), + { + "first": np.array([False, True, True]), + "last": np.array([True, True, False]), + False: np.array([True, True, True]), + }, + ), + ( + [2, "a", "b"], + list("abc"), + { + "first": np.zeros(shape=(3), dtype=np.bool), + "last": np.zeros(shape=(3), dtype=np.bool), + False: np.zeros(shape=(3), dtype=np.bool), + }, + ), + ( + list("abb"), + list("abc"), + { + "first": np.array([False, False, True]), + "last": np.array([False, True, False]), + False: np.array([False, True, True]), + }, + ), + ], + ) + def test_drop_duplicates(self, data, categories, expected): + + idx = CategoricalIndex(data, categories=categories, name="foo") + for keep, e in expected.items(): + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), e) + e = idx[~e] + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, e) + + @pytest.mark.parametrize( + "data, categories, expected_data, expected_categories", + [ + ([1, 1, 1], [1, 2, 3], [1], [1]), + ([1, 1, 1], list("abc"), [np.nan], []), + ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]), + ([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]), + ], + ) + def test_unique(self, data, categories, expected_data, expected_categories): + + idx = CategoricalIndex(data, categories=categories) + expected = CategoricalIndex(expected_data, categories=expected_categories) tm.assert_index_equal(idx.unique(), expected) def test_repr_roundtrip(self): @@ -488,7 +492,7 @@ def test_equals_categorical(self): assert not ci.equals(CategoricalIndex(list("aabca") + [np.nan], ordered=True)) assert ci.equals(ci.copy()) - def test_equals_categoridcal_unordered(self): + def test_equals_categorical_unordered(self): # https://github.com/pandas-dev/pandas/issues/16603 a = pd.CategoricalIndex(["A"], categories=["A", "B"]) b = pd.CategoricalIndex(["A"], categories=["B", "A"]) @@ -503,118 +507,6 @@ def test_frame_repr(self): expected = " A\na 1\nb 2\nc 3" assert result == expected - def test_string_categorical_index_repr(self): - # short - idx = pd.CategoricalIndex(["a", "bb", "ccc"]) - expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa - assert repr(idx) == expected - - # multiple lines - idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 10) - expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', - 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', - 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # truncated - idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100) - expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', - ... 
- 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa - - assert repr(idx) == expected - - # larger categories - idx = pd.CategoricalIndex(list("abcdefghijklmmo")) - expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', - 'm', 'm', 'o'], - categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # short - idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) - expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa - assert repr(idx) == expected - - # multiple lines - idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) - expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', - 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # truncated - idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) - expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', - ... - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa - - assert repr(idx) == expected - - # larger categories - idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) - expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', - 'す', 'せ', 'そ'], - categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # Emable Unicode option ----------------------------------------- - with cf.option_context("display.unicode.east_asian_width", True): - - # short - idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) - expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa - assert repr(idx) == expected - - # multiple lines - idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) - expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - # truncated - idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) - expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう', 'あ', - ... 
- 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa - - assert repr(idx) == expected - - # larger categories - idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) - expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', - 'さ', 'し', 'す', 'せ', 'そ'], - categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa - - assert repr(idx) == expected - - def test_fillna_categorical(self): - # GH 11343 - idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") - # fill by value in categories - exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x") - tm.assert_index_equal(idx.fillna(1.0), exp) - - # fill by value not in categories raises ValueError - msg = "fill value must be in categories" - with pytest.raises(ValueError, match=msg): - idx.fillna(2.0) - @pytest.mark.parametrize( "dtype, engine_type", [ diff --git a/pandas/tests/indexes/categorical/test_fillna.py b/pandas/tests/indexes/categorical/test_fillna.py new file mode 100644 index 0000000000000..0d878249d3800 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_fillna.py @@ -0,0 +1,19 @@ +import numpy as np +import pytest + +from pandas import CategoricalIndex +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_categorical(self): + # GH#11343 + idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") + # fill by value in categories + exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x") + tm.assert_index_equal(idx.fillna(1.0), exp) + + # fill by value not in categories raises ValueError + msg = "fill value must be in categories" + with pytest.raises(ValueError, match=msg): + idx.fillna(2.0) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py new file mode 100644 index 0000000000000..a5607224f6448 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -0,0 +1,108 @@ +""" +Tests for CategoricalIndex.__repr__ and related methods. +""" +import pandas._config.config as cf + +import pandas as pd + + +class TestCategoricalIndexRepr: + def test_string_categorical_index_repr(self): + # short + idx = pd.CategoricalIndex(["a", "bb", "ccc"]) + expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa + assert repr(idx) == expected + + # multiple lines + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 10) + expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', + 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # truncated + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100) + expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', + ... 
+ 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa + + assert repr(idx) == expected + + # larger categories + idx = pd.CategoricalIndex(list("abcdefghijklmmo")) + expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', + 'm', 'm', 'o'], + categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # short + idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + assert repr(idx) == expected + + # multiple lines + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', + 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # truncated + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', + ... + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa + + assert repr(idx) == expected + + # larger categories + idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) + expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', + 'す', 'せ', 'そ'], + categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # Emable Unicode option ----------------------------------------- + with cf.option_context("display.unicode.east_asian_width", True): + + # short + idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + assert repr(idx) == expected + + # multiple lines + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected + + # truncated + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) + expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', + 'ううう', 'あ', + ... 
+ 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', + 'あ', 'いい', 'ううう'], + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa + + assert repr(idx) == expected + + # larger categories + idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) + expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', + 'さ', 'し', 'す', 'せ', 'そ'], + categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa + + assert repr(idx) == expected diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 507e38d9acac2..a36568bbbe633 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import CategoricalIndex, Index +from pandas import CategoricalIndex, Index, IntervalIndex import pandas._testing as tm @@ -65,7 +65,8 @@ def test_take_fill_value(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) def test_take_fill_value_datetime(self): @@ -104,7 +105,8 @@ def test_take_fill_value_datetime(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) def test_take_invalid_kwargs(self): @@ -248,3 +250,67 @@ def test_get_indexer(self): msg = "method='nearest' not implemented yet for CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="nearest") + + +class TestWhere: + @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) + def test_where(self, klass): + i = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * (len(i) - 1) + expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + +class TestContains: + def test_contains(self): + + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=False) + + assert "a" in ci + assert "z" not in ci + assert "e" not in ci + assert np.nan not in ci + + # assert codes NOT in index + assert 0 not in ci + assert 1 not in ci + + def test_contains_nan(self): + ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) + assert np.nan in ci + + @pytest.mark.parametrize( + "item, expected", + [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ("a", False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + ], + ids=str, + ) + def test_contains_interval(self, item, expected): + # GH 23705 + ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) + result = item in ci + assert result is expected + + def test_contains_list(self): + # GH#21729 + idx = pd.CategoricalIndex([1, 2, 3]) + + assert "a" not in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a"] in idx + + with pytest.raises(TypeError, match="unhashable type"): + ["a", "b"] in idx diff --git 
a/pandas/tests/indexes/categorical/test_map.py b/pandas/tests/indexes/categorical/test_map.py index 943359a72e971..6cef555275444 100644 --- a/pandas/tests/indexes/categorical/test_map.py +++ b/pandas/tests/indexes/categorical/test_map.py @@ -15,12 +15,12 @@ class TestMap: ], ids=["string", "interval"], ) - def test_map_str(self, data, categories, ordered_fixture): + def test_map_str(self, data, categories, ordered): # GH 31202 - override base class since we want to maintain categorical/ordered - index = CategoricalIndex(data, categories=categories, ordered=ordered_fixture) + index = CategoricalIndex(data, categories=categories, ordered=ordered) result = index.map(str) expected = CategoricalIndex( - map(str, data), categories=map(str, categories), ordered=ordered_fixture + map(str, data), categories=map(str, categories), ordered=ordered ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index dca317a9eb03f..964cf320a422b 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -35,6 +35,9 @@ class Base: _holder: Optional[Type[Index]] = None _compat_props = ["shape", "ndim", "size", "nbytes"] + def create_index(self) -> Index: + raise NotImplementedError("Method not implemented") + def test_pickle_compat_construction(self): # need an object to create with msg = ( @@ -46,34 +49,6 @@ def test_pickle_compat_construction(self): with pytest.raises(TypeError, match=msg): self._holder() - def test_to_series(self): - # assert that we are creating a copy of the index - - idx = self.create_index() - s = idx.to_series() - assert s.values is not idx.values - assert s.index is not idx - assert s.name == idx.name - - def test_to_series_with_arguments(self): - # GH18699 - - # index kwarg - idx = self.create_index() - s = idx.to_series(index=idx) - - assert s.values is not idx.values - assert s.index is idx - assert s.name == idx.name - - # name kwarg - idx = self.create_index() - s = idx.to_series(name="__test") - - assert s.values is not idx.values - assert s.index is not idx - assert s.name != idx.name - @pytest.mark.parametrize("name", [None, "new_name"]) def test_to_frame(self, name): # see GH-15230, GH-22580 @@ -195,15 +170,6 @@ def test_logical_compat(self): with pytest.raises(TypeError, match="cannot perform any"): idx.any() - def test_boolean_context_compat(self): - - # boolean context compat - idx = self.create_index() - - with pytest.raises(ValueError, match="The truth value of a"): - if idx: - pass - def test_reindex_base(self): idx = self.create_index() expected = np.arange(idx.size, dtype=np.intp) @@ -250,14 +216,6 @@ def test_repr_roundtrip(self): idx = self.create_index() tm.assert_index_equal(eval(repr(idx)), idx) - def test_str(self): - - # test the string repr - idx = self.create_index() - idx.name = "foo" - assert "'foo'" in str(idx) - assert type(idx).__name__ in str(idx) - def test_repr_max_seq_item_setting(self): # GH10182 idx = self.create_index() @@ -310,16 +268,11 @@ def test_ensure_copied_data(self, indices): result = result.tz_localize("UTC").tz_convert(indices.tz) tm.assert_index_equal(indices, result) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="copy" - ) if isinstance(indices, PeriodIndex): # .values an object array of Period, thus copied result = index_type(ordinal=indices.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="same" - ) + 
tm.assert_numpy_array_equal(indices.asi8, result.asi8, check_same="same") elif isinstance(indices, IntervalIndex): # checked in test_interval.py pass @@ -328,9 +281,6 @@ def test_ensure_copied_data(self, indices): tm.assert_numpy_array_equal( indices.values, result.values, check_same="same" ) - tm.assert_numpy_array_equal( - indices._ndarray_values, result._ndarray_values, check_same="same" - ) def test_memory_usage(self, indices): indices._engine.clear_mapping() @@ -512,7 +462,6 @@ def test_union_base(self, indices): with pytest.raises(TypeError, match=msg): first.union([1, 2, 3]) - @pytest.mark.parametrize("sort", [None, False]) def test_difference_base(self, sort, indices): first = indices[2:] second = indices[:4] @@ -917,3 +866,29 @@ def test_contains_requires_hashable_raises(self): with pytest.raises(TypeError): {} in idx._engine + + def test_copy_copies_cache(self): + # GH32898 + idx = self.create_index() + idx.get_loc(idx[0]) # populates the _cache. + copy = idx.copy() + + # check that the copied cache is a copy of the original + assert idx._cache == copy._cache + assert idx._cache is not copy._cache + # cache values should reference the same object + for key, val in idx._cache.items(): + assert copy._cache[key] is val, key + + def test_shallow_copy_copies_cache(self): + # GH32669 + idx = self.create_index() + idx.get_loc(idx[0]) # populates the _cache. + shallow_copy = idx._shallow_copy() + + # check that the shallow_copied cache is a copy of the original + assert idx._cache == shallow_copy._cache + assert idx._cache is not shallow_copy._cache + # cache values should reference the same object + for key, val in idx._cache.items(): + assert shallow_copy._cache[key] is val, key diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py new file mode 100644 index 0000000000000..fb17e1df6341b --- /dev/null +++ b/pandas/tests/indexes/conftest.py @@ -0,0 +1,27 @@ +import pytest + + +@pytest.fixture(params=[None, False]) +def sort(request): + """ + Valid values for the 'sort' parameter used in the Index + setops methods (intersection, union, etc.) + + Caution: + Don't confuse this one with the "sort" fixture used + for DataFrame.append or concat. That one has + parameters [True, False]. + + We can't combine them as sort=True is not permitted + in in the Index setops methods. + """ + return request.param + + +@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]) +def freq_sample(request): + """ + Valid values for 'freq' parameter used to create date_range and + timedelta_range.. 
+ """ + return request.param diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 916f722247a14..34169a670c169 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -22,27 +22,32 @@ class TestDatetimeIndex: def test_astype(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], name="idx") result = idx.astype(object) - expected = Index([Timestamp("2016-05-16")] + [NaT] * 3, dtype=object) + expected = Index( + [Timestamp("2016-05-16")] + [NaT] * 3, dtype=object, name="idx" + ) tm.assert_index_equal(result, expected) result = idx.astype(int) expected = Int64Index( - [1463356800000000000] + [-9223372036854775808] * 3, dtype=np.int64 + [1463356800000000000] + [-9223372036854775808] * 3, + dtype=np.int64, + name="idx", ) tm.assert_index_equal(result, expected) - rng = date_range("1/1/2000", periods=10) + rng = date_range("1/1/2000", periods=10, name="idx") result = rng.astype("i8") - tm.assert_index_equal(result, Index(rng.asi8)) + tm.assert_index_equal(result, Index(rng.asi8, name="idx")) tm.assert_numpy_array_equal(result.values, rng.asi8) def test_astype_uint(self): - arr = date_range("2000", periods=2) + arr = date_range("2000", periods=2, name="idx") expected = pd.UInt64Index( - np.array([946684800000000000, 946771200000000000], dtype="uint64") + np.array([946684800000000000, 946771200000000000], dtype="uint64"), + name="idx", ) tm.assert_index_equal(arr.astype("uint64"), expected) @@ -148,7 +153,7 @@ def test_astype_str(self): def test_astype_datetime64(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], name="idx") result = idx.astype("datetime64[ns]") tm.assert_index_equal(result, idx) @@ -158,10 +163,12 @@ def test_astype_datetime64(self): tm.assert_index_equal(result, idx) assert result is idx - idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], tz="EST") + idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], tz="EST", name="idx") result = idx_tz.astype("datetime64[ns]") expected = DatetimeIndex( - ["2016-05-16 05:00:00", "NaT", "NaT", "NaT"], dtype="datetime64[ns]" + ["2016-05-16 05:00:00", "NaT", "NaT", "NaT"], + dtype="datetime64[ns]", + name="idx", ) tm.assert_index_equal(result, expected) @@ -273,8 +280,8 @@ def _check_rng(rng): def test_integer_index_astype_datetime(self, tz, dtype): # GH 20997, 20964, 24559 val = [pd.Timestamp("2018-01-01", tz=tz).value] - result = pd.Index(val).astype(dtype) - expected = pd.DatetimeIndex(["2018-01-01"], tz=tz) + result = pd.Index(val, name="idx").astype(dtype) + expected = pd.DatetimeIndex(["2018-01-01"], tz=tz, name="idx") tm.assert_index_equal(result, expected) def test_dti_astype_period(self): @@ -292,10 +299,11 @@ def test_dti_astype_period(self): class TestAstype: @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_astype_category(self, tz): - obj = pd.date_range("2000", periods=2, tz=tz) + obj = pd.date_range("2000", periods=2, tz=tz, name="idx") result = obj.astype("category") expected = pd.CategoricalIndex( - [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] + [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)], + name="idx", ) tm.assert_index_equal(result, expected) @@ -305,9 +313,9 @@ def test_astype_category(self, tz): @pytest.mark.parametrize("tz", [None, "US/Central"]) def 
test_astype_array_fallback(self, tz): - obj = pd.date_range("2000", periods=2, tz=tz) + obj = pd.date_range("2000", periods=2, tz=tz, name="idx") result = obj.astype(bool) - expected = pd.Index(np.array([True, True])) + expected = pd.Index(np.array([True, True]), name="idx") tm.assert_index_equal(result, expected) result = obj._data.astype(bool) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index b293c008d6683..0247947ff19c5 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -415,7 +415,8 @@ def test_construction_dti_with_mixed_timezones(self): # tz mismatch affecting to tz-aware raises TypeError/ValueError - with pytest.raises(ValueError): + msg = "cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): DatetimeIndex( [ Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), @@ -424,7 +425,6 @@ def test_construction_dti_with_mixed_timezones(self): name="idx", ) - msg = "cannot be converted to datetime64" with pytest.raises(ValueError, match=msg): DatetimeIndex( [ @@ -435,7 +435,7 @@ def test_construction_dti_with_mixed_timezones(self): name="idx", ) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): DatetimeIndex( [ Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), @@ -480,7 +480,8 @@ def test_construction_outofbounds(self): # coerces to object tm.assert_index_equal(Index(dates), exp) - with pytest.raises(OutOfBoundsDatetime): + msg = "Out of bounds nanosecond timestamp" + with pytest.raises(OutOfBoundsDatetime, match=msg): # can't create DatetimeIndex DatetimeIndex(dates) @@ -516,7 +517,8 @@ def test_constructor_coverage(self): with pytest.raises(TypeError, match=msg): date_range(start="1/1/2000", periods="foo", freq="D") - with pytest.raises(TypeError): + msg = "DatetimeIndex\\(\\) must be called with a collection" + with pytest.raises(TypeError, match=msg): DatetimeIndex("1/1/2000") # generator expression @@ -664,7 +666,8 @@ def test_constructor_dtype(self): @pytest.mark.parametrize("dtype", [object, np.int32, np.int64]) def test_constructor_invalid_dtype_raises(self, dtype): # GH 23986 - with pytest.raises(ValueError): + msg = "Unexpected value for 'dtype'" + with pytest.raises(ValueError, match=msg): DatetimeIndex([1, 2], dtype=dtype) def test_constructor_name(self): @@ -681,7 +684,8 @@ def test_000constructor_resolution(self): def test_disallow_setting_tz(self): # GH 3746 dti = DatetimeIndex(["2010"], tz="UTC") - with pytest.raises(AttributeError): + msg = "Cannot directly set timezone" + with pytest.raises(AttributeError, match=msg): dti.tz = pytz.timezone("US/Pacific") @pytest.mark.parametrize( @@ -770,7 +774,8 @@ def test_construction_from_replaced_timestamps_with_dst(self): def test_construction_with_tz_and_tz_aware_dti(self): # GH 23579 dti = date_range("2016-01-01", periods=3, tz="US/Central") - with pytest.raises(TypeError): + msg = "data is already tz-aware US/Central, unable to set specified tz" + with pytest.raises(TypeError, match=msg): DatetimeIndex(dti, tz="Asia/Tokyo") def test_construction_with_nat_and_tzlocal(self): @@ -790,7 +795,8 @@ def test_constructor_no_precision_raises(self): pd.Index(["2000"], dtype="datetime64") def test_constructor_wrong_precision_raises(self): - with pytest.raises(ValueError): + msg = "Unexpected value for 'dtype': 'datetime64\\[us\\]'" + with pytest.raises(ValueError, match=msg): pd.DatetimeIndex(["2000"], dtype="datetime64[us]") def 
test_index_constructor_with_numpy_object_array_and_timestamp_tz_with_nan(self): diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index d33351fe94a8c..b8200bb686aad 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -9,6 +9,7 @@ import pytz from pytz import timezone +from pandas._libs.tslibs import timezones from pandas.errors import OutOfBoundsDatetime import pandas.util._test_decorators as td @@ -153,9 +154,10 @@ def test_date_range_int64_overflow_stride_endpoint_different_signs(self): def test_date_range_out_of_bounds(self): # GH#14187 - with pytest.raises(OutOfBoundsDatetime): + msg = "Cannot generate range" + with pytest.raises(OutOfBoundsDatetime, match=msg): date_range("2016-01-01", periods=100000, freq="D") - with pytest.raises(OutOfBoundsDatetime): + with pytest.raises(OutOfBoundsDatetime, match=msg): date_range(end="1763-10-12", periods=100000, freq="D") def test_date_range_gen_error(self): @@ -661,6 +663,60 @@ def test_negative_non_tick_frequency_descending_dates(self, tz_aware_fixture): tm.assert_index_equal(result, expected) +class TestDateRangeTZ: + """Tests for date_range with timezones""" + + def test_hongkong_tz_convert(self): + # GH#1673 smoke test + dr = date_range("2012-01-01", "2012-01-10", freq="D", tz="Hongkong") + + # it works! + dr.hour + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_date_range_span_dst_transition(self, tzstr): + # GH#1778 + + # Standard -> Daylight Savings Time + dr = date_range("03/06/2012 00:00", periods=200, freq="W-FRI", tz="US/Eastern") + + assert (dr.hour == 0).all() + + dr = date_range("2012-11-02", periods=10, tz=tzstr) + result = dr.hour + expected = pd.Index([0] * 10) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_date_range_timezone_str_argument(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + result = date_range("1/1/2000", periods=10, tz=tzstr) + expected = date_range("1/1/2000", periods=10, tz=tz) + + tm.assert_index_equal(result, expected) + + def test_date_range_with_fixedoffset_noname(self): + from pandas.tests.indexes.datetimes.test_timezones import fixed_off_no_name + + off = fixed_off_no_name + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + + idx = pd.Index([start, end]) + assert off == idx.tz + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_date_range_with_tz(self, tzstr): + stamp = Timestamp("3/11/2012 05:00", tz=tzstr) + assert stamp.hour == 5 + + rng = date_range("3/11/2012 04:00", periods=10, freq="H", tz=tzstr) + + assert stamp == rng[1] + + class TestGenRangeGeneration: def test_generate(self): rng1 = list(generate_range(START, END, offset=BDay())) @@ -736,9 +792,10 @@ def test_precision_finer_than_offset(self): ) def test_mismatching_tz_raises_err(self, start, end): # issue 18488 - with pytest.raises(TypeError): + msg = "Start and end cannot both be tz-aware with different timezones" + with pytest.raises(TypeError, match=msg): pd.date_range(start, end) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): pd.date_range(start, end, freq=BDay()) @@ -771,16 +828,17 @@ def test_misc(self): def test_date_parse_failure(self): badly_formed_date = "2007/100/1" - with 
pytest.raises(ValueError): + msg = "could not convert string to Timestamp" + with pytest.raises(ValueError, match=msg): Timestamp(badly_formed_date) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): bdate_range(start=badly_formed_date, periods=10) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): bdate_range(end=badly_formed_date, periods=10) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): bdate_range(badly_formed_date, badly_formed_date) def test_daterange_bug_456(self): @@ -813,8 +871,9 @@ def test_bday_near_overflow(self): def test_bday_overflow_error(self): # GH#24252 check that we get OutOfBoundsDatetime and not OverflowError + msg = "Out of bounds nanosecond timestamp" start = pd.Timestamp.max.floor("D").to_pydatetime() - with pytest.raises(OutOfBoundsDatetime): + with pytest.raises(OutOfBoundsDatetime, match=msg): pd.date_range(start, periods=2, freq="B") diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 1a72ef2bdf1aa..e109c7a4f1c8d 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,11 +1,11 @@ -from datetime import date +from datetime import date, timedelta import dateutil import numpy as np import pytest import pandas as pd -from pandas import DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets +from pandas import DataFrame, DatetimeIndex, Index, NaT, Timestamp, date_range, offsets import pandas._testing as tm randn = np.random.randn @@ -20,12 +20,69 @@ def test_roundtrip_pickle_with_tz(self): unpickled = tm.round_trip_pickle(index) tm.assert_index_equal(index, unpickled) + def test_pickle(self): + + # GH#4606 + p = tm.round_trip_pickle(NaT) + assert p is NaT + + idx = pd.to_datetime(["2013-01-01", NaT, "2014-01-06"]) + idx_p = tm.round_trip_pickle(idx) + assert idx_p[0] == idx[0] + assert idx_p[1] is NaT + assert idx_p[2] == idx[2] + + # GH#11002 + # don't infer freq + idx = date_range("1750-1-1", "2050-1-1", freq="7D") + idx_p = tm.round_trip_pickle(idx) + tm.assert_index_equal(idx, idx_p) + def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): # GH7774 index = date_range("20130101", periods=3, tz="US/Eastern") assert str(index.reindex([])[0].tz) == "US/Eastern" assert str(index.reindex(np.array([]))[0].tz) == "US/Eastern" + def test_reindex_with_same_tz(self): + # GH 32740 + rng_a = date_range("2010-01-01", "2010-01-02", periods=24, tz="utc") + rng_b = date_range("2010-01-01", "2010-01-02", periods=23, tz="utc") + result1, result2 = rng_a.reindex( + rng_b, method="nearest", tolerance=timedelta(seconds=20) + ) + expected_list1 = [ + "2010-01-01 00:00:00", + "2010-01-01 01:05:27.272727272", + "2010-01-01 02:10:54.545454545", + "2010-01-01 03:16:21.818181818", + "2010-01-01 04:21:49.090909090", + "2010-01-01 05:27:16.363636363", + "2010-01-01 06:32:43.636363636", + "2010-01-01 07:38:10.909090909", + "2010-01-01 08:43:38.181818181", + "2010-01-01 09:49:05.454545454", + "2010-01-01 10:54:32.727272727", + "2010-01-01 12:00:00", + "2010-01-01 13:05:27.272727272", + "2010-01-01 14:10:54.545454545", + "2010-01-01 15:16:21.818181818", + "2010-01-01 16:21:49.090909090", + "2010-01-01 17:27:16.363636363", + "2010-01-01 18:32:43.636363636", + "2010-01-01 19:38:10.909090909", + "2010-01-01 20:43:38.181818181", + "2010-01-01 21:49:05.454545454", + "2010-01-01 22:54:32.727272727", + "2010-01-02 00:00:00", + ] + expected1 = DatetimeIndex( + 
expected_list1, dtype="datetime64[ns, UTC]", freq=None, + ) + expected2 = np.array([0] + [-1] * 21 + [23], dtype=np.int64,) + tm.assert_index_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) + def test_time_loc(self): # GH8667 from datetime import time from pandas._libs.index import _SIZE_CUTOFF @@ -86,13 +143,6 @@ def test_week_of_month_frequency(self): expected = DatetimeIndex(dates, freq="WOM-1SAT") tm.assert_index_equal(result, expected) - def test_hash_error(self): - index = date_range("20010101", periods=10) - with pytest.raises( - TypeError, match=f"unhashable type: '{type(index).__name__}'" - ): - hash(index) - def test_stringified_slice_with_tz(self): # GH#2658 start = "2013-01-07" @@ -407,3 +457,11 @@ def test_index_map(self, name): ((2018,), range(1, 7)), names=[name, name] ) tm.assert_index_equal(index, exp_index) + + def test_split_non_utc(self): + # GH 14042 + indices = pd.date_range("2016-01-01 00:00:00+0200", freq="S", periods=10) + result = np.split(indices, indices_or_sections=[])[0] + expected = indices.copy() + expected._set_freq(None) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index da1bd6f091d1a..e4785e5f80256 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -17,7 +17,7 @@ class TestDatetimeIndex(DatetimeLike): def indices(self, request): return request.param - def create_index(self): + def create_index(self) -> DatetimeIndex: return date_range("20130101", periods=5) def test_shift(self): diff --git a/pandas/tests/indexes/datetimes/test_delete.py b/pandas/tests/indexes/datetimes/test_delete.py new file mode 100644 index 0000000000000..4fbb440bc89e5 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_delete.py @@ -0,0 +1,134 @@ +import pytest + +from pandas import DatetimeIndex, Series, date_range +import pandas._testing as tm + + +class TestDelete: + def test_delete(self): + idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") + + # preserve freq + expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") + expected_4 = date_range(start="2000-01-01", periods=4, freq="M", name="idx") + + # reset freq to None + expected_1 = DatetimeIndex( + ["2000-01-31", "2000-03-31", "2000-04-30", "2000-05-31"], + freq=None, + name="idx", + ) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + with pytest.raises((IndexError, ValueError), match="out of bounds"): + # either depending on numpy version + idx.delete(5) + + for tz in [None, "Asia/Tokyo", "US/Pacific"]: + idx = date_range( + start="2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + ) + + expected = date_range( + start="2000-01-01 10:00", periods=9, freq="H", name="idx", tz=tz + ) + result = idx.delete(0) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freqstr == "H" + assert result.tz == expected.tz + + expected = date_range( + start="2000-01-01 09:00", periods=9, freq="H", name="idx", tz=tz + ) + result = idx.delete(-1) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freqstr == "H" + assert result.tz == expected.tz + + def 
test_delete_slice(self): + idx = date_range(start="2000-01-01", periods=10, freq="D", name="idx") + + # preserve freq + expected_0_2 = date_range(start="2000-01-04", periods=7, freq="D", name="idx") + expected_7_9 = date_range(start="2000-01-01", periods=7, freq="D", name="idx") + + # reset freq to None + expected_3_5 = DatetimeIndex( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + "2000-01-07", + "2000-01-08", + "2000-01-09", + "2000-01-10", + ], + freq=None, + name="idx", + ) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + result = idx.delete(slice(n[0], n[-1] + 1)) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + for tz in [None, "Asia/Tokyo", "US/Pacific"]: + ts = Series( + 1, + index=date_range( + "2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + ), + ) + # preserve freq + result = ts.drop(ts.index[:5]).index + expected = date_range( + "2000-01-01 14:00", periods=5, freq="H", name="idx", tz=tz + ) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + # reset freq to None + result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 11:00", + "2000-01-01 13:00", + "2000-01-01 15:00", + "2000-01-01 17:00", + ], + freq=None, + name="idx", + tz=tz, + ) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_fillna.py similarity index 98% rename from pandas/tests/indexes/datetimes/test_missing.py rename to pandas/tests/indexes/datetimes/test_fillna.py index 3399c8eaf6750..5fbe60bb0c50f 100644 --- a/pandas/tests/indexes/datetimes/test_missing.py +++ b/pandas/tests/indexes/datetimes/test_fillna.py @@ -4,7 +4,7 @@ import pandas._testing as tm -class TestDatetimeIndex: +class TestDatetimeIndexFillNA: @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) def test_fillna_datetime64(self, tz): # GH 11343 diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index ceab670fb5041..ff15cded19b1c 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -2,7 +2,6 @@ import numpy as np import pytest -import pytz import pandas as pd from pandas import DatetimeIndex, Index, Timestamp, date_range, notna @@ -25,6 +24,13 @@ def test_ellipsis(self): assert result.equals(idx) assert result is not idx + def test_getitem_slice_keeps_name(self): + # GH4226 + st = pd.Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") + et = pd.Timestamp("2013-07-02 00:00:00", tz="America/Los_Angeles") + dr = pd.date_range(st, et, freq="H", name="timebucket") + assert dr[1:].name == dr.name + def test_getitem(self): idx1 = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") idx2 = pd.date_range( @@ -119,8 +125,31 @@ def test_dti_custom_getitem_matplotlib_hackaround(self): expected = rng.values[:, None] tm.assert_numpy_array_equal(values, expected) + def test_getitem_int_list(self): + dti = date_range(start="1/1/2005", 
end="12/1/2005", freq="M") + dti2 = dti[[1, 3, 5]] + + v1 = dti2[0] + v2 = dti2[1] + v3 = dti2[2] + + assert v1 == Timestamp("2/28/2005") + assert v2 == Timestamp("4/30/2005") + assert v3 == Timestamp("6/30/2005") + + # getitem with non-slice drops freq + assert dti2.freq is None + class TestWhere: + def test_where_doesnt_retain_freq(self): + dti = date_range("20130101", periods=3, freq="D", name="idx") + cond = [True, True, False] + expected = DatetimeIndex([dti[0], dti[1], dti[0]], freq=None, name="idx") + + result = dti.where(cond, dti[::-1]) + tm.assert_index_equal(result, expected) + def test_where_other(self): # other is ndarray or Index i = pd.date_range("20130101", periods=3, tz="US/Eastern") @@ -304,7 +333,8 @@ def test_take_fill_value(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "out of bounds" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) def test_take_fill_value_with_timezone(self): @@ -340,7 +370,8 @@ def test_take_fill_value_with_timezone(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "out of bounds" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) @@ -420,7 +451,8 @@ def test_get_loc(self): tm.assert_numpy_array_equal( idx.get_loc(time(12, 30)), np.array([]), check_dtype=False ) - with pytest.raises(NotImplementedError): + msg = "cannot yet lookup inexact labels when key is a time object" + with pytest.raises(NotImplementedError, match=msg): idx.get_loc(time(12, 30), method="pad") def test_get_loc_tz_aware(self): @@ -454,7 +486,8 @@ def test_get_loc_nat(self): def test_get_loc_timedelta_invalid_key(self, key): # GH#20464 dti = pd.date_range("1970-01-01", periods=10) - with pytest.raises(TypeError): + msg = "Cannot index DatetimeIndex with [Tt]imedelta" + with pytest.raises(TypeError, match=msg): dti.get_loc(key) def test_get_loc_reasonable_key_error(self): @@ -464,301 +497,14 @@ def test_get_loc_reasonable_key_error(self): index.get_loc("1/1/2000") -class TestDatetimeIndex: - @pytest.mark.parametrize( - "null", [None, np.nan, np.datetime64("NaT"), pd.NaT, pd.NA] - ) - @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) - def test_insert_nat(self, tz, null): - # GH#16537, GH#18295 (test missing) - idx = pd.DatetimeIndex(["2017-01-01"], tz=tz) - expected = pd.DatetimeIndex(["NaT", "2017-01-01"], tz=tz) - res = idx.insert(0, null) - tm.assert_index_equal(res, expected) - - @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) - def test_insert_invalid_na(self, tz): - idx = pd.DatetimeIndex(["2017-01-01"], tz=tz) - with pytest.raises(TypeError, match="incompatible label"): - idx.insert(0, np.timedelta64("NaT")) - - def test_insert(self): - idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"], name="idx") - - result = idx.insert(2, datetime(2000, 1, 5)) - exp = DatetimeIndex( - ["2000-01-04", "2000-01-01", "2000-01-05", "2000-01-02"], name="idx" - ) - tm.assert_index_equal(result, exp) - - # insertion of non-datetime should coerce to object index - result = idx.insert(1, "inserted") - expected = Index( - [ - datetime(2000, 1, 4), - "inserted", - datetime(2000, 1, 1), - datetime(2000, 1, 2), - ], - name="idx", - ) - assert not isinstance(result, DatetimeIndex) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - - idx = date_range("1/1/2000", periods=3, freq="M", name="idx") - - # preserve freq 
- expected_0 = DatetimeIndex( - ["1999-12-31", "2000-01-31", "2000-02-29", "2000-03-31"], - name="idx", - freq="M", - ) - expected_3 = DatetimeIndex( - ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30"], - name="idx", - freq="M", - ) - - # reset freq to None - expected_1_nofreq = DatetimeIndex( - ["2000-01-31", "2000-01-31", "2000-02-29", "2000-03-31"], - name="idx", - freq=None, - ) - expected_3_nofreq = DatetimeIndex( - ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], - name="idx", - freq=None, - ) - - cases = [ - (0, datetime(1999, 12, 31), expected_0), - (-3, datetime(1999, 12, 31), expected_0), - (3, datetime(2000, 4, 30), expected_3), - (1, datetime(2000, 1, 31), expected_1_nofreq), - (3, datetime(2000, 1, 2), expected_3_nofreq), - ] - - for n, d, expected in cases: - result = idx.insert(n, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - # reset freq to None - result = idx.insert(3, datetime(2000, 1, 2)) - expected = DatetimeIndex( - ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], - name="idx", - freq=None, - ) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq is None - - # see gh-7299 - idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") - with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): - idx.insert(3, pd.Timestamp("2000-01-04")) - with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): - idx.insert(3, datetime(2000, 1, 4)) - with pytest.raises(ValueError): - idx.insert(3, pd.Timestamp("2000-01-04", tz="US/Eastern")) - with pytest.raises(ValueError): - idx.insert(3, datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern"))) - - for tz in ["US/Pacific", "Asia/Singapore"]: - idx = date_range("1/1/2000 09:00", periods=6, freq="H", tz=tz, name="idx") - # preserve freq - expected = date_range( - "1/1/2000 09:00", periods=7, freq="H", tz=tz, name="idx" - ) - for d in [ - pd.Timestamp("2000-01-01 15:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), - ]: - - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz - - expected = DatetimeIndex( - [ - "2000-01-01 09:00", - "2000-01-01 10:00", - "2000-01-01 11:00", - "2000-01-01 12:00", - "2000-01-01 13:00", - "2000-01-01 14:00", - "2000-01-01 10:00", - ], - name="idx", - tz=tz, - freq=None, - ) - # reset freq to None - for d in [ - pd.Timestamp("2000-01-01 10:00", tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), - ]: - result = idx.insert(6, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.tz == expected.tz - assert result.freq is None - - def test_delete(self): - idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") - - # preserve freq - expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") - expected_4 = date_range(start="2000-01-01", periods=4, freq="M", name="idx") - - # reset freq to None - expected_1 = DatetimeIndex( - ["2000-01-31", "2000-03-31", "2000-04-30", "2000-05-31"], - freq=None, - name="idx", - ) - - cases = { - 0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1, - } - for n, expected in cases.items(): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == 
expected.freq - - with pytest.raises((IndexError, ValueError)): - # either depending on numpy version - idx.delete(5) +class TestContains: + def test_dti_contains_with_duplicates(self): + d = datetime(2011, 12, 5, 20, 30) + ix = DatetimeIndex([d, d]) + assert d in ix - for tz in [None, "Asia/Tokyo", "US/Pacific"]: - idx = date_range( - start="2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz - ) - - expected = date_range( - start="2000-01-01 10:00", periods=9, freq="H", name="idx", tz=tz - ) - result = idx.delete(0) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freqstr == "H" - assert result.tz == expected.tz - - expected = date_range( - start="2000-01-01 09:00", periods=9, freq="H", name="idx", tz=tz - ) - result = idx.delete(-1) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freqstr == "H" - assert result.tz == expected.tz - - def test_delete_slice(self): - idx = date_range(start="2000-01-01", periods=10, freq="D", name="idx") - - # preserve freq - expected_0_2 = date_range(start="2000-01-04", periods=7, freq="D", name="idx") - expected_7_9 = date_range(start="2000-01-01", periods=7, freq="D", name="idx") - - # reset freq to None - expected_3_5 = DatetimeIndex( - [ - "2000-01-01", - "2000-01-02", - "2000-01-03", - "2000-01-07", - "2000-01-08", - "2000-01-09", - "2000-01-10", - ], - freq=None, - name="idx", - ) - - cases = { - (0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5, - } - for n, expected in cases.items(): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - result = idx.delete(slice(n[0], n[-1] + 1)) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - for tz in [None, "Asia/Tokyo", "US/Pacific"]: - ts = pd.Series( - 1, - index=pd.date_range( - "2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz - ), - ) - # preserve freq - result = ts.drop(ts.index[:5]).index - expected = pd.date_range( - "2000-01-01 14:00", periods=5, freq="H", name="idx", tz=tz - ) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz - - # reset freq to None - result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index - expected = DatetimeIndex( - [ - "2000-01-01 09:00", - "2000-01-01 11:00", - "2000-01-01 13:00", - "2000-01-01 15:00", - "2000-01-01 17:00", - ], - freq=None, - name="idx", - tz=tz, - ) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz - - def test_get_value(self): - # specifically make sure we have test for np.datetime64 key - dti = pd.date_range("2016-01-01", periods=3) - - arr = np.arange(6, 9) - ser = pd.Series(arr, index=dti) - - key = dti[1] - - with pytest.raises(AttributeError, match="has no attribute '_values'"): - dti.get_value(arr, key) - - result = dti.get_value(ser, key) - assert result == 7 - - result = dti.get_value(ser, key.to_pydatetime()) - assert result == 7 - - result = dti.get_value(ser, key.to_datetime64()) - assert result == 7 +class TestGetIndexer: def test_get_indexer(self): idx = pd.date_range("2000-01-01", periods=3) exp = np.array([0, 1, 2], dtype=np.intp) @@ -796,5 +542,49 @@ def test_get_indexer(self): ] with pytest.raises(ValueError, match="abbreviation w/o a number"): 
idx.get_indexer(target, "nearest", tolerance=tol_bad) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="abbreviation w/o a number"): idx.get_indexer(idx[[0]], method="nearest", tolerance="foo") + + +class TestMaybeCastSliceBound: + def test_maybe_cast_slice_bounds_empty(self): + # GH#14354 + empty_idx = date_range(freq="1H", periods=0, end="2015") + + right = empty_idx._maybe_cast_slice_bound("2015-01-02", "right", "loc") + exp = Timestamp("2015-01-02 23:59:59.999999999") + assert right == exp + + left = empty_idx._maybe_cast_slice_bound("2015-01-02", "left", "loc") + exp = Timestamp("2015-01-02 00:00:00") + assert left == exp + + def test_maybe_cast_slice_duplicate_monotonic(self): + # https://github.com/pandas-dev/pandas/issues/16515 + idx = DatetimeIndex(["2017", "2017"]) + result = idx._maybe_cast_slice_bound("2017-01-01", "left", "loc") + expected = Timestamp("2017-01-01") + assert result == expected + + +class TestDatetimeIndex: + def test_get_value(self): + # specifically make sure we have test for np.datetime64 key + dti = pd.date_range("2016-01-01", periods=3) + + arr = np.arange(6, 9) + ser = pd.Series(arr, index=dti) + + key = dti[1] + + with pytest.raises(AttributeError, match="has no attribute '_values'"): + dti.get_value(arr, key) + + result = dti.get_value(ser, key) + assert result == 7 + + result = dti.get_value(ser, key.to_pydatetime()) + assert result == 7 + + result = dti.get_value(ser, key.to_datetime64()) + assert result == 7 diff --git a/pandas/tests/indexes/datetimes/test_insert.py b/pandas/tests/indexes/datetimes/test_insert.py new file mode 100644 index 0000000000000..4abb4f0006444 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_insert.py @@ -0,0 +1,153 @@ +from datetime import datetime + +import numpy as np +import pytest +import pytz + +from pandas import NA, DatetimeIndex, Index, NaT, Timestamp, date_range +import pandas._testing as tm + + +class TestInsert: + @pytest.mark.parametrize("null", [None, np.nan, np.datetime64("NaT"), NaT, NA]) + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) + def test_insert_nat(self, tz, null): + # GH#16537, GH#18295 (test missing) + idx = DatetimeIndex(["2017-01-01"], tz=tz) + expected = DatetimeIndex(["NaT", "2017-01-01"], tz=tz) + res = idx.insert(0, null) + tm.assert_index_equal(res, expected) + + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) + def test_insert_invalid_na(self, tz): + idx = DatetimeIndex(["2017-01-01"], tz=tz) + with pytest.raises(TypeError, match="incompatible label"): + idx.insert(0, np.timedelta64("NaT")) + + def test_insert(self): + idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"], name="idx") + + result = idx.insert(2, datetime(2000, 1, 5)) + exp = DatetimeIndex( + ["2000-01-04", "2000-01-01", "2000-01-05", "2000-01-02"], name="idx" + ) + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, "inserted") + expected = Index( + [ + datetime(2000, 1, 4), + "inserted", + datetime(2000, 1, 1), + datetime(2000, 1, 2), + ], + name="idx", + ) + assert not isinstance(result, DatetimeIndex) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + idx = date_range("1/1/2000", periods=3, freq="M", name="idx") + + # preserve freq + expected_0 = DatetimeIndex( + ["1999-12-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq="M", + ) + expected_3 = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30"], + 
name="idx", + freq="M", + ) + + # reset freq to None + expected_1_nofreq = DatetimeIndex( + ["2000-01-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq=None, + ) + expected_3_nofreq = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ) + + cases = [ + (0, datetime(1999, 12, 31), expected_0), + (-3, datetime(1999, 12, 31), expected_0), + (3, datetime(2000, 4, 30), expected_3), + (1, datetime(2000, 1, 31), expected_1_nofreq), + (3, datetime(2000, 1, 2), expected_3_nofreq), + ] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + # reset freq to None + result = idx.insert(3, datetime(2000, 1, 2)) + expected = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq is None + + # see gh-7299 + idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): + idx.insert(3, Timestamp("2000-01-04")) + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): + idx.insert(3, datetime(2000, 1, 4)) + with pytest.raises(ValueError, match="Timezones don't match"): + idx.insert(3, Timestamp("2000-01-04", tz="US/Eastern")) + with pytest.raises(ValueError, match="Timezones don't match"): + idx.insert(3, datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern"))) + + for tz in ["US/Pacific", "Asia/Singapore"]: + idx = date_range("1/1/2000 09:00", periods=6, freq="H", tz=tz, name="idx") + # preserve freq + expected = date_range( + "1/1/2000 09:00", periods=7, freq="H", tz=tz, name="idx" + ) + for d in [ + Timestamp("2000-01-01 15:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), + ]: + + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-01 14:00", + "2000-01-01 10:00", + ], + name="idx", + tz=tz, + freq=None, + ) + # reset freq to None + for d in [ + Timestamp("2000-01-01 10:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), + ]: + result = idx.insert(6, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.tz == expected.tz + assert result.freq is None diff --git a/pandas/tests/indexes/datetimes/test_join.py b/pandas/tests/indexes/datetimes/test_join.py index f2f88fd7dc90c..9a9c94fa19e6d 100644 --- a/pandas/tests/indexes/datetimes/test_join.py +++ b/pandas/tests/indexes/datetimes/test_join.py @@ -64,7 +64,6 @@ def test_join_utc_convert(self, join_type): assert isinstance(result, DatetimeIndex) assert result.tz.zone == "UTC" - @pytest.mark.parametrize("sort", [None, False]) def test_datetimeindex_union_join_empty(self, sort): dti = date_range(start="1/1/2001", end="2/1/2001", freq="D") empty = Index([]) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 8ed98410ad9a4..c55b0481c1041 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -1,16 +1,21 @@ from datetime import datetime 
-import warnings import numpy as np import pytest -from pandas.core.dtypes.generic import ABCDateOffset - import pandas as pd -from pandas import DatetimeIndex, Index, Series, Timestamp, bdate_range, date_range +from pandas import ( + DateOffset, + DatetimeIndex, + Index, + Series, + Timestamp, + bdate_range, + date_range, +) import pandas._testing as tm -from pandas.tseries.offsets import BDay, BMonthEnd, CDay, Day, Hour +from pandas.tseries.offsets import BDay, Day, Hour START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) @@ -259,9 +264,9 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - def test_drop_duplicates_metadata(self): + def test_drop_duplicates_metadata(self, freq_sample): # GH 10115 - idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") + idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -272,57 +277,38 @@ def test_drop_duplicates_metadata(self): tm.assert_index_equal(idx, result) assert result.freq is None - def test_drop_duplicates(self): + @pytest.mark.parametrize( + "keep, expected, index", + [ + ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), + ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10), + ), + ], + ) + def test_drop_duplicates(self, freq_sample, keep, expected, index): # to check Index/Series compat - base = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") - idx = base.append(base[:5]) + idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") + idx = idx.append(idx[:5]) - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - tm.assert_series_equal(res, Series(base)) + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) + expected = idx[~expected] - res = idx.drop_duplicates(keep="last") - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep="last") - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, expected) - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + result = Series(idx).drop_duplicates(keep=keep) + tm.assert_series_equal(result, Series(expected, index=index)) - @pytest.mark.parametrize( - "freq", - [ - "A", - "2A", - "-2A", - "Q", - "-1Q", - "M", - "-1M", - "D", - "3D", - "-3D", - "W", - "-1W", - "H", - "2H", - "-2H", - "T", - "2T", - "S", - "-3S", - ], - ) - def test_infer_freq(self, freq): + def test_infer_freq(self, freq_sample): # GH 11018 - idx = pd.date_range("2011-01-01 09:00:00", freq=freq, periods=10) + idx = pd.date_range("2011-01-01 09:00:00", freq=freq_sample, periods=10) result = pd.DatetimeIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) - assert result.freq == freq + assert result.freq == freq_sample def test_nat(self, tz_naive_fixture): tz = tz_naive_fixture @@ -363,7 +349,7 @@ def test_equals(self): assert not idx.equals(pd.Series(idx2)) # same internal, different tz - idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz="US/Pacific") + idx3 = 
pd.DatetimeIndex(idx.asi8, tz="US/Pacific") tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) assert not idx.equals(idx3.copy()) @@ -394,7 +380,7 @@ def test_freq_setter(self, values, freq, tz): # can set to an offset, converting from string if necessary idx._data.freq = freq assert idx.freq == freq - assert isinstance(idx.freq, ABCDateOffset) + assert isinstance(idx.freq, DateOffset) # can reset to None idx._data.freq = None @@ -437,23 +423,6 @@ def test_copy(self): repr(cp) tm.assert_index_equal(cp, self.rng) - def test_shift(self): - shifted = self.rng.shift(5) - assert shifted[0] == self.rng[5] - assert shifted.freq == self.rng.freq - - shifted = self.rng.shift(-5) - assert shifted[5] == self.rng[0] - assert shifted.freq == self.rng.freq - - shifted = self.rng.shift(0) - assert shifted[0] == self.rng[0] - assert shifted.freq == self.rng.freq - - rng = date_range(START, END, freq=BMonthEnd()) - shifted = rng.shift(1, freq=BDay()) - assert shifted[0] == rng[0] + BDay() - def test_equals(self): assert not self.rng.equals(list(self.rng)) @@ -491,32 +460,6 @@ def test_copy(self): repr(cp) tm.assert_index_equal(cp, self.rng) - def test_shift(self): - - shifted = self.rng.shift(5) - assert shifted[0] == self.rng[5] - assert shifted.freq == self.rng.freq - - shifted = self.rng.shift(-5) - assert shifted[5] == self.rng[0] - assert shifted.freq == self.rng.freq - - shifted = self.rng.shift(0) - assert shifted[0] == self.rng[0] - assert shifted.freq == self.rng.freq - - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", pd.errors.PerformanceWarning) - rng = date_range(START, END, freq=BMonthEnd()) - shifted = rng.shift(1, freq=CDay()) - assert shifted[0] == rng[0] + CDay() - - def test_shift_periods(self): - # GH#22458 : argument 'n' was deprecated in favor of 'periods' - idx = pd.date_range(start=START, end=END, periods=3) - tm.assert_index_equal(idx.shift(periods=0), idx) - tm.assert_index_equal(idx.shift(0), idx) - def test_pickle_unpickle(self): unpickled = tm.round_trip_pickle(self.rng) assert unpickled.freq is not None diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 946d658e90132..ddde30d0f8fbf 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -21,28 +21,6 @@ class TestSlicing: - def test_dti_slicing(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") - dti2 = dti[[1, 3, 5]] - - v1 = dti2[0] - v2 = dti2[1] - v3 = dti2[2] - - assert v1 == Timestamp("2/28/2005") - assert v2 == Timestamp("4/30/2005") - assert v3 == Timestamp("6/30/2005") - - # don't carry freq through irregular slicing - assert dti2.freq is None - - def test_slice_keeps_name(self): - # GH4226 - st = pd.Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") - et = pd.Timestamp("2013-07-02 00:00:00", tz="America/Los_Angeles") - dr = pd.date_range(st, et, freq="H", name="timebucket") - assert dr[1:].name == dr.name - def test_slice_with_negative_step(self): ts = Series(np.arange(20), date_range("2014-01-01", periods=20, freq="MS")) SLC = pd.IndexSlice @@ -80,25 +58,6 @@ def test_slice_with_zero_step_raises(self): with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] - def test_slice_bounds_empty(self): - # GH#14354 - empty_idx = date_range(freq="1H", periods=0, end="2015") - - right = empty_idx._maybe_cast_slice_bound("2015-01-02", "right", "loc") - exp = 
Timestamp("2015-01-02 23:59:59.999999999") - assert right == exp - - left = empty_idx._maybe_cast_slice_bound("2015-01-02", "left", "loc") - exp = Timestamp("2015-01-02 00:00:00") - assert left == exp - - def test_slice_duplicate_monotonic(self): - # https://github.com/pandas-dev/pandas/issues/16515 - idx = pd.DatetimeIndex(["2017", "2017"]) - result = idx._maybe_cast_slice_bound("2017-01-01", "left", "loc") - expected = Timestamp("2017-01-01") - assert result == expected - def test_monotone_DTI_indexing_bug(self): # GH 19362 # Testing accessing the first element in a monotonic descending diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index d58ecbad4c1b3..c088301097beb 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -33,7 +33,6 @@ class TestDatetimeIndexSetOps: ] # TODO: moved from test_datetimelike; dedup with version below - @pytest.mark.parametrize("sort", [None, False]) def test_union2(self, sort): everything = tm.makeDateIndex(10) first = everything[:5] @@ -42,7 +41,6 @@ def test_union2(self, sort): tm.assert_index_equal(union, everything) @pytest.mark.parametrize("box", [np.array, Series, list]) - @pytest.mark.parametrize("sort", [None, False]) def test_union3(self, sort, box): everything = tm.makeDateIndex(10) first = everything[:5] @@ -57,7 +55,6 @@ def test_union3(self, sort, box): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("tz", tz) - @pytest.mark.parametrize("sort", [None, False]) def test_union(self, tz, sort): rng1 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) other1 = pd.date_range("1/6/2000", freq="D", periods=5, tz=tz) @@ -89,7 +86,6 @@ def test_union(self, tz, sort): else: tm.assert_index_equal(result_union, exp_notsorted) - @pytest.mark.parametrize("sort", [None, False]) def test_union_coverage(self, sort): idx = DatetimeIndex(["2000-01-03", "2000-01-01", "2000-01-02"]) ordered = DatetimeIndex(idx.sort_values(), freq="infer") @@ -100,7 +96,6 @@ def test_union_coverage(self, sort): tm.assert_index_equal(result, ordered) assert result.freq == ordered.freq - @pytest.mark.parametrize("sort", [None, False]) def test_union_bug_1730(self, sort): rng_a = date_range("1/1/2012", periods=4, freq="3H") rng_b = date_range("1/1/2012", periods=4, freq="4H") @@ -113,7 +108,6 @@ def test_union_bug_1730(self, sort): exp = DatetimeIndex(exp) tm.assert_index_equal(result, exp) - @pytest.mark.parametrize("sort", [None, False]) def test_union_bug_1745(self, sort): left = DatetimeIndex(["2012-05-11 15:19:49.695000"]) right = DatetimeIndex( @@ -137,7 +131,6 @@ def test_union_bug_1745(self, sort): exp = exp.sort_values() tm.assert_index_equal(result, exp) - @pytest.mark.parametrize("sort", [None, False]) def test_union_bug_4564(self, sort): from pandas import DateOffset @@ -152,7 +145,6 @@ def test_union_bug_4564(self, sort): exp = DatetimeIndex(exp) tm.assert_index_equal(result, exp) - @pytest.mark.parametrize("sort", [None, False]) def test_union_freq_both_none(self, sort): # GH11086 expected = bdate_range("20150101", periods=10) @@ -188,7 +180,6 @@ def test_union_dataframe_index(self): exp = pd.date_range("1/1/1980", "1/1/2012", freq="MS") tm.assert_index_equal(df.index, exp) - @pytest.mark.parametrize("sort", [None, False]) def test_union_with_DatetimeIndex(self, sort): i1 = Int64Index(np.arange(0, 20, 2)) i2 = date_range(start="2012-01-03 00:00:00", periods=10, freq="D") @@ -218,7 +209,6 @@ def 
test_intersection2(self): @pytest.mark.parametrize( "tz", [None, "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] ) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, tz, sort): # GH 4690 (with tz) base = date_range("6/1/2000", "6/30/2000", freq="D", name="idx") @@ -298,7 +288,6 @@ def test_intersection_bug_1708(self): assert len(result) == 0 @pytest.mark.parametrize("tz", tz) - @pytest.mark.parametrize("sort", [None, False]) def test_difference(self, tz, sort): rng_dates = ["1/2/2000", "1/3/2000", "1/1/2000", "1/4/2000", "1/5/2000"] @@ -324,7 +313,6 @@ def test_difference(self, tz, sort): expected = expected.sort_values() tm.assert_index_equal(result_diff, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_difference_freq(self, sort): # GH14323: difference of DatetimeIndex should not preserve frequency @@ -341,18 +329,51 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_datetimeindex_diff(self, sort): dti1 = date_range(freq="Q-JAN", start=datetime(1997, 12, 31), periods=100) dti2 = date_range(freq="Q-JAN", start=datetime(1997, 12, 31), periods=98) assert len(dti1.difference(dti2, sort)) == 2 + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Eastern"]) + def test_setops_preserve_freq(self, tz): + rng = date_range("1/1/2000", "1/1/2002", name="idx", tz=tz) + + result = rng[:50].union(rng[50:100]) + assert result.name == rng.name + assert result.freq == rng.freq + assert result.tz == rng.tz + + result = rng[:50].union(rng[30:100]) + assert result.name == rng.name + assert result.freq == rng.freq + assert result.tz == rng.tz + + result = rng[:50].union(rng[60:100]) + assert result.name == rng.name + assert result.freq is None + assert result.tz == rng.tz + + result = rng[:50].intersection(rng[25:75]) + assert result.name == rng.name + assert result.freqstr == "D" + assert result.tz == rng.tz + + nofreq = DatetimeIndex(list(rng[25:75]), name="other") + result = rng[:50].union(nofreq) + assert result.name is None + assert result.freq == rng.freq + assert result.tz == rng.tz + + result = rng[:50].intersection(nofreq) + assert result.name is None + assert result.freq == rng.freq + assert result.tz == rng.tz + class TestBusinessDatetimeIndex: def setup_method(self, method): self.rng = bdate_range(START, END) - @pytest.mark.parametrize("sort", [None, False]) def test_union(self, sort): # overlapping left = self.rng[:10] @@ -388,7 +409,6 @@ def test_union(self, sort): the_union = self.rng.union(rng, sort=sort) assert isinstance(the_union, DatetimeIndex) - @pytest.mark.parametrize("sort", [None, False]) def test_union_not_cacheable(self, sort): rng = date_range("1/1/2000", periods=50, freq=Minute()) rng1 = rng[10:] @@ -431,7 +451,6 @@ def test_intersection_bug(self): result = a.intersection(b) tm.assert_index_equal(result, b) - @pytest.mark.parametrize("sort", [None, False]) def test_month_range_union_tz_pytz(self, sort): from pytz import timezone @@ -449,7 +468,6 @@ def test_month_range_union_tz_pytz(self, sort): early_dr.union(late_dr, sort=sort) @td.skip_if_windows_python_3 - @pytest.mark.parametrize("sort", [None, False]) def test_month_range_union_tz_dateutil(self, sort): from pandas._libs.tslibs.timezones import dateutil_gettz @@ -471,7 +489,6 @@ class TestCustomDatetimeIndex: def setup_method(self, method): self.rng = bdate_range(START, END, freq="C") - @pytest.mark.parametrize("sort", [None, False]) 
def test_union(self, sort): # overlapping left = self.rng[:10] diff --git a/pandas/tests/indexes/datetimes/test_shift.py b/pandas/tests/indexes/datetimes/test_shift.py index 1c87995931c62..6e53492b71578 100644 --- a/pandas/tests/indexes/datetimes/test_shift.py +++ b/pandas/tests/indexes/datetimes/test_shift.py @@ -9,6 +9,8 @@ from pandas import DatetimeIndex, Series, date_range import pandas._testing as tm +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + class TestDatetimeIndexShift: @@ -80,7 +82,7 @@ def test_dti_shift_int(self): def test_dti_shift_no_freq(self): # GH#19147 dti = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) - with pytest.raises(NullFrequencyError): + with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"): dti.shift(2) @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) @@ -115,3 +117,34 @@ def test_dti_shift_near_midnight(self, shift, result_time): result = s.shift(shift, freq="H") expected = Series(1, index=DatetimeIndex([result_time], tz="EST")) tm.assert_series_equal(result, expected) + + def test_shift_periods(self): + # GH#22458 : argument 'n' was deprecated in favor of 'periods' + idx = pd.date_range(start=START, end=END, periods=3) + tm.assert_index_equal(idx.shift(periods=0), idx) + tm.assert_index_equal(idx.shift(0), idx) + + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_shift_bday(self, freq): + rng = date_range(START, END, freq=freq) + shifted = rng.shift(5) + assert shifted[0] == rng[5] + assert shifted.freq == rng.freq + + shifted = rng.shift(-5) + assert shifted[5] == rng[0] + assert shifted.freq == rng.freq + + shifted = rng.shift(0) + assert shifted[0] == rng[0] + assert shifted.freq == rng.freq + + def test_shift_bmonth(self): + rng = date_range(START, END, freq=pd.offsets.BMonthEnd()) + shifted = rng.shift(1, freq=pd.offsets.BDay()) + assert shifted[0] == rng[0] + pd.offsets.BDay() + + rng = date_range(START, END, freq=pd.offsets.BMonthEnd()) + with tm.assert_produces_warning(pd.errors.PerformanceWarning): + shifted = rng.shift(1, freq=pd.offsets.CDay()) + assert shifted[0] == rng[0] + pd.offsets.CDay() diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 9c1e8cb0f563f..fbddf765be79c 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -319,10 +319,10 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] index = DatetimeIndex(times) tz = "US/Eastern" - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): index.tz_localize(tz=tz) - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): index.tz_localize(tz=tz, nonexistent="raise") result = index.tz_localize(tz=tz, nonexistent="NaT") @@ -336,7 +336,7 @@ def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError): + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dr.tz_localize(tz) # With repeated hours, we can infer the transition @@ -365,7 +365,7 @@ def test_dti_tz_localize_ambiguous_infer(self, tz): def test_dti_tz_localize_ambiguous_times(self, tz): # 
March 13, 2011, spring forward, skip from 2 AM to 3 AM dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=pd.offsets.Hour()) - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:30:00"): dr.tz_localize(tz) # after dst transition, it works @@ -375,7 +375,7 @@ def test_dti_tz_localize_ambiguous_times(self, tz): # November 6, 2011, fall back, repeat 2 AM hour dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=pd.offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError): + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dr.tz_localize(tz) # UTC is OK @@ -411,11 +411,11 @@ def test_dti_tz_localize(self, prefix): tm.assert_numpy_array_equal(dti3.values, dti_utc.values) dti = pd.date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="L") - with pytest.raises(pytz.AmbiguousTimeError): + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): dti.tz_localize(tzstr) dti = pd.date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="L") - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): dti.tz_localize(tzstr) @pytest.mark.parametrize( @@ -441,7 +441,7 @@ def test_dti_tz_localize_utc_conversion(self, tz): # DST ambiguity, this should fail rng = date_range("3/11/2012", "3/12/2012", freq="30T") # Is this really how it should fail?? - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"): rng.tz_localize(tz) def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): @@ -452,7 +452,9 @@ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): tz = tz_aware_fixture localized = idx.tz_localize(tz) # cant localize a tz-aware object - with pytest.raises(TypeError): + with pytest.raises( + TypeError, match="Already tz-aware, use tz_convert to convert" + ): localized.tz_localize(tz) reset = localized.tz_localize(None) assert reset.tzinfo is None @@ -542,7 +544,8 @@ def test_dti_tz_localize_ambiguous_flags(self, tz): di = DatetimeIndex(times) # When the sizes are incompatible, make sure error is raised - with pytest.raises(Exception): + msg = "Length of ambiguous bool-array must be the same size as vals" + with pytest.raises(Exception, match=msg): di.tz_localize(tz, ambiguous=is_dst) # When sizes are compatible and there are repeats ('infer' won't work) @@ -564,7 +567,7 @@ def test_dti_construction_ambiguous_endpoint(self, tz): # construction with an ambiguous end-point # GH#11626 - with pytest.raises(pytz.AmbiguousTimeError): + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): date_range( "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H" ) @@ -588,7 +591,7 @@ def test_dti_construction_ambiguous_endpoint(self, tz): def test_dti_construction_nonexistent_endpoint(self, tz, option, expected): # construction with an nonexistent end-point - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"): date_range( "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="H" ) @@ -613,10 +616,15 @@ def test_dti_tz_localize_nonexistent(self, tz, method, exp): n = 60 dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") if method == "raise": - with pytest.raises(pytz.NonExistentTimeError): + with pytest.raises(pytz.NonExistentTimeError, match="2015-03-29 02:00:00"): dti.tz_localize(tz, 
nonexistent=method) elif exp == "invalid": - with pytest.raises(ValueError): + msg = ( + "The nonexistent argument must be one of " + "'raise', 'NaT', 'shift_forward', 'shift_backward' " + "or a timedelta object" + ) + with pytest.raises(ValueError, match=msg): dti.tz_localize(tz, nonexistent=method) else: result = dti.tz_localize(tz, nonexistent=method) @@ -1082,7 +1090,8 @@ def test_with_tz(self, tz): dr = bdate_range( datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) ) - with pytest.raises(Exception): + msg = "Start and end cannot both be tz-aware with different timezones" + with pytest.raises(Exception, match=msg): bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) @pytest.mark.parametrize("prefix", ["", "dateutil/"]) @@ -1152,74 +1161,3 @@ def test_iteration_preserves_nanoseconds(self, tz): ) for i, ts in enumerate(index): assert ts == index[i] - - -class TestDateRange: - """Tests for date_range with timezones""" - - def test_hongkong_tz_convert(self): - # GH#1673 smoke test - dr = date_range("2012-01-01", "2012-01-10", freq="D", tz="Hongkong") - - # it works! - dr.hour - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_date_range_span_dst_transition(self, tzstr): - # GH#1778 - - # Standard -> Daylight Savings Time - dr = date_range("03/06/2012 00:00", periods=200, freq="W-FRI", tz="US/Eastern") - - assert (dr.hour == 0).all() - - dr = date_range("2012-11-02", periods=10, tz=tzstr) - result = dr.hour - expected = Index([0] * 10) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_date_range_timezone_str_argument(self, tzstr): - tz = timezones.maybe_get_tz(tzstr) - result = date_range("1/1/2000", periods=10, tz=tzstr) - expected = date_range("1/1/2000", periods=10, tz=tz) - - tm.assert_index_equal(result, expected) - - def test_date_range_with_fixedoffset_noname(self): - off = fixed_off_no_name - start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) - end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) - rng = date_range(start=start, end=end) - assert off == rng.tz - - idx = Index([start, end]) - assert off == idx.tz - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_date_range_with_tz(self, tzstr): - stamp = Timestamp("3/11/2012 05:00", tz=tzstr) - assert stamp.hour == 5 - - rng = date_range("3/11/2012 04:00", periods=10, freq="H", tz=tzstr) - - assert stamp == rng[1] - - -class TestToDatetime: - """Tests for the to_datetime constructor with timezones""" - - def test_to_datetime_utc(self): - arr = np.array([dateutil.parser.parse("2012-06-13T01:39:00Z")], dtype=object) - - result = to_datetime(arr, utc=True) - assert result.tz is pytz.utc - - def test_to_datetime_fixed_offset(self): - dates = [ - datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off), - ] - result = to_datetime(dates) - assert result.tz == fixed_off diff --git a/pandas/tests/indexes/datetimes/test_to_period.py b/pandas/tests/indexes/datetimes/test_to_period.py index ddbb43787abb4..d82fc1ef6743b 100644 --- a/pandas/tests/indexes/datetimes/test_to_period.py +++ b/pandas/tests/indexes/datetimes/test_to_period.py @@ -1,3 +1,5 @@ +import warnings + import dateutil.tz from dateutil.tz import tzlocal import pytest @@ -75,6 +77,28 @@ def test_to_period_monthish(self): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): date_range("01-Jan-2012", periods=8, 
freq="EOM") + def test_to_period_infer(self): + # https://github.com/pandas-dev/pandas/issues/33358 + rng = date_range( + start="2019-12-22 06:40:00+00:00", + end="2019-12-22 08:45:00+00:00", + freq="5min", + ) + + with tm.assert_produces_warning(None): + # Using simple filter because we are not checking for the warning here + warnings.simplefilter("ignore", UserWarning) + + pi1 = rng.to_period("5min") + + with tm.assert_produces_warning(None): + # Using simple filter because we are not checking for the warning here + warnings.simplefilter("ignore", UserWarning) + + pi2 = rng.to_period() + + tm.assert_index_equal(pi1, pi2) + def test_period_dt64_round_trip(self): dti = date_range("1/1/2000", "1/7/2002", freq="B") pi = dti.to_period() @@ -147,7 +171,8 @@ def test_to_period_tz_utc_offset_consistency(self, tz): def test_to_period_nofreq(self): idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) - with pytest.raises(ValueError): + msg = "You must pass a freq argument as current index has none." + with pytest.raises(ValueError, match=msg): idx.to_period() idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="infer") diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 837c124db2bed..fa881df8139c6 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -91,7 +91,7 @@ def test_constructor_nan(self, constructor, breaks, closed): assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + tm.assert_numpy_array_equal(np.array(result), expected_values) @pytest.mark.parametrize( "breaks", @@ -114,7 +114,7 @@ def test_constructor_empty(self, constructor, breaks, closed): assert result.empty assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result._ndarray_values, expected_values) + tm.assert_numpy_array_equal(np.array(result), expected_values) @pytest.mark.parametrize( "breaks", diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 0e5721bfd83fd..0e08a3f41b666 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -240,10 +240,10 @@ def test_get_indexer_length_one_interval(self, size, closed): ["foo", "foo", "bar", "baz"], ], ) - def test_get_indexer_categorical(self, target, ordered_fixture): + def test_get_indexer_categorical(self, target, ordered): # GH 30063: categorical and non-categorical results should be consistent index = IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]) - categorical_target = CategoricalIndex(target, ordered=ordered_fixture) + categorical_target = CategoricalIndex(target, ordered=ordered) result = index.get_indexer(categorical_target) expected = index.get_indexer(target) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index c2b209c810af9..1b2bfa8573c21 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -147,7 +147,7 @@ def test_ensure_copied_data(self, closed): ) # by-definition make a copy - result = IntervalIndex(index._ndarray_values, copy=False) + result = IntervalIndex(np.array(index), copy=False) tm.assert_numpy_array_equal( index.left.values, result.left.values, check_same="copy" ) @@ -863,3 
+863,25 @@ def test_dir(): index = IntervalIndex.from_arrays([0, 1], [1, 2]) result = dir(index) assert "str" not in result + + +@pytest.mark.parametrize("klass", [list, np.array, pd.array, pd.Series]) +def test_searchsorted_different_argument_classes(klass): + # https://github.com/pandas-dev/pandas/issues/32762 + values = IntervalIndex([Interval(0, 1), Interval(1, 2)]) + result = values.searchsorted(klass(values)) + expected = np.array([0, 1], dtype=result.dtype) + tm.assert_numpy_array_equal(result, expected) + + result = values._data.searchsorted(klass(values)) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "arg", [[1, 2], ["a", "b"], [pd.Timestamp("2020-01-01", tz="Europe/London")] * 2] +) +def test_searchsorted_invalid_argument(arg): + values = IntervalIndex([Interval(0, 1), Interval(1, 2)]) + msg = "unorderable types" + with pytest.raises(TypeError, match=msg): + values.searchsorted(arg) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index d9359d717de1d..e3e5070064aff 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -10,11 +10,6 @@ def name(request): return request.param -@pytest.fixture(params=[None, False]) -def sort(request): - return request.param - - def monotonic_index(start, end, dtype="int64", closed="right"): return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), closed=closed) @@ -153,7 +148,6 @@ def test_symmetric_difference(self, closed, sort): @pytest.mark.parametrize( "op_name", ["union", "intersection", "difference", "symmetric_difference"] ) - @pytest.mark.parametrize("sort", [None, False]) def test_set_incompatible_types(self, closed, op_name, sort): index = monotonic_index(0, 11, closed=closed) set_op = getattr(index, op_name) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index a9e02934f27ab..cd98a87459061 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -57,23 +57,6 @@ def test_truncate(): index.truncate(3, 1) -def test_where(): - i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) - - msg = r"\.where is not supported for MultiIndex operations" - with pytest.raises(NotImplementedError, match=msg): - i.where(True) - - -@pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) -def test_where_array_like(klass): - i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) - cond = [False, True] - msg = r"\.where is not supported for MultiIndex operations" - with pytest.raises(NotImplementedError, match=msg): - i.where(klass(cond)) - - # TODO: reshape diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index ef549beccda5d..9273de9c20412 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -37,7 +37,11 @@ def test_logical_compat(idx, method): def test_boolean_context_compat(idx): - with pytest.raises(ValueError): + msg = ( + "The truth value of a MultiIndex is ambiguous. " + r"Use a.empty, a.bool\(\), a.item\(\), a.any\(\) or a.all\(\)." 
+ ) + with pytest.raises(ValueError, match=msg): bool(idx) diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 2c4b3ce04f96d..1157c7f8bb962 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -1,3 +1,6 @@ +from datetime import date, datetime +import itertools + import numpy as np import pytest @@ -6,7 +9,7 @@ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike import pandas as pd -from pandas import Index, MultiIndex, date_range +from pandas import Index, MultiIndex, Series, date_range import pandas._testing as tm @@ -723,3 +726,73 @@ def test_index_equal_empty_iterable(): a = MultiIndex(levels=[[], []], codes=[[], []], names=["a", "b"]) b = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"]) tm.assert_index_equal(a, b) + + +def test_raise_invalid_sortorder(): + # Test that the MultiIndex constructor raise when a incorrect sortorder is given + # GH#28518 + + levels = [[0, 1], [0, 1, 2]] + + # Correct sortorder + MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 + ) + + with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"): + MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2, + ) + + with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"): + MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1, + ) + + +def test_datetimeindex(): + idx1 = pd.DatetimeIndex( + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo", + ) + idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") + idx = MultiIndex.from_arrays([idx1, idx2]) + + expected1 = pd.DatetimeIndex( + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"], tz="Asia/Tokyo" + ) + + tm.assert_index_equal(idx.levels[0], expected1) + tm.assert_index_equal(idx.levels[1], idx2) + + # from datetime combos + # GH 7888 + date1 = date.today() + date2 = datetime.today() + date3 = Timestamp.today() + + for d1, d2 in itertools.product([date1, date2, date3], [date1, date2, date3]): + index = MultiIndex.from_product([[d1], [d2]]) + assert isinstance(index.levels[0], pd.DatetimeIndex) + assert isinstance(index.levels[1], pd.DatetimeIndex) + + +def test_constructor_with_tz(): + + index = pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ) + columns = pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" + ) + + result = MultiIndex.from_arrays([index, columns]) + + assert result.names == ["dt1", "dt2"] + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) + + result = MultiIndex.from_arrays([Series(index), Series(columns)]) + + assert result.names == ["dt1", "dt2"] + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index bfc432a18458a..3519c5d0d5a9a 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -2,16 +2,10 @@ import pytest import pandas as pd -from pandas import DataFrame, MultiIndex, date_range +from pandas import DataFrame, MultiIndex import pandas._testing as tm -def test_tolist(idx): - result = idx.tolist() - exp = list(idx.values) - assert result == 
exp - - def test_to_numpy(idx): result = idx.to_numpy() exp = idx.values @@ -129,47 +123,6 @@ def test_to_frame_resulting_column_order(): assert result == expected -def test_roundtrip_pickle_with_tz(): - return # FIXME: this can't be right? - - # GH 8367 - # round-trip of timezone - index = MultiIndex.from_product( - [[1, 2], ["a", "b"], date_range("20130101", periods=3, tz="US/Eastern")], - names=["one", "two", "three"], - ) - unpickled = tm.round_trip_pickle(index) - assert index.equal_levels(unpickled) - - -def test_to_series(idx): - # assert that we are creating a copy of the index - - s = idx.to_series() - assert s.values is not idx.values - assert s.index is not idx - assert s.name == idx.name - - -def test_to_series_with_arguments(idx): - # GH18699 - - # index kwarg - s = idx.to_series(index=idx) - - assert s.values is not idx.values - assert s.index is idx - assert s.name == idx.name - - # name kwarg - idx = idx - s = idx.to_series(name="__test") - - assert s.values is not idx.values - assert s.index is not idx - assert s.name != idx.name - - def test_to_flat_index(idx): expected = pd.Index( ( diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index b909025b3f2f9..6ba565f0406ab 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -139,52 +139,3 @@ def test_drop_not_lexsorted(): tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi) with tm.assert_produces_warning(PerformanceWarning): tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a")) - - -@pytest.mark.parametrize( - "msg,labels,level", - [ - (r"labels \[4\] not found in level", 4, "a"), - (r"labels \[7\] not found in level", 7, "b"), - ], -) -def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): - # GH 8594 - mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) - s = pd.Series([10, 20, 30], index=mi) - df = pd.DataFrame([10, 20, 30], index=mi) - - with pytest.raises(KeyError, match=msg): - s.drop(labels, level=level) - with pytest.raises(KeyError, match=msg): - df.drop(labels, level=level) - - -@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")]) -def test_drop_errors_ignore(labels, level): - # GH 8594 - mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) - s = pd.Series([10, 20, 30], index=mi) - df = pd.DataFrame([10, 20, 30], index=mi) - - expected_s = s.drop(labels, level=level, errors="ignore") - tm.assert_series_equal(s, expected_s) - - expected_df = df.drop(labels, level=level, errors="ignore") - tm.assert_frame_equal(df, expected_df) - - -def test_drop_with_non_unique_datetime_index_and_invalid_keys(): - # GH 30399 - - # define dataframe with unique datetime index - df = pd.DataFrame( - np.random.randn(5, 3), - columns=["a", "b", "c"], - index=pd.date_range("2012", freq="H", periods=5), - ) - # create dataframe with non-unique datetime index - df = df.iloc[[0, 2, 2, 3]].copy() - - with pytest.raises(KeyError, match="not found in axis"): - df.drop(["a", "b"]) # Dropping with labels not exist in the index diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 93e1de535835f..e48731b9c8099 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -83,12 +83,14 @@ def test_get_unique_index(idx, dropna): def test_duplicate_multiindex_codes(): # GH 17464 # Make sure that a MultiIndex with duplicate levels throws a ValueError - with 
pytest.raises(ValueError): + msg = r"Level values must be unique: \[[A', ]+\] on level 0" + with pytest.raises(ValueError, match=msg): mi = MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)]) # And that using set_levels with duplicate levels fails mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) - with pytest.raises(ValueError): + msg = r"Level values must be unique: \[[AB', ]+\] on level 0" + with pytest.raises(ValueError, match=msg): mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) @@ -238,7 +240,6 @@ def test_duplicated(idx_dup, keep, expected): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("keep", ["first", "last", False]) def test_duplicated_large(keep): # GH 9125 n, k = 200, 5000 @@ -274,3 +275,29 @@ def test_duplicated2(): tm.assert_numpy_array_equal( mi.duplicated(), np.zeros(len(mi), dtype="bool") ) + + +def test_duplicated_drop_duplicates(): + # GH#4060 + idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2])) + + expected = np.array([False, False, False, True, False, False], dtype=bool) + duplicated = idx.duplicated() + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(), expected) + + expected = np.array([True, False, False, False, False, False]) + duplicated = idx.duplicated(keep="last") + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected) + + expected = np.array([True, False, False, True, False, False]) + duplicated = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(duplicated, expected) + assert duplicated.dtype == bool + expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2])) + tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_formats.py similarity index 93% rename from pandas/tests/indexes/multi/test_format.py rename to pandas/tests/indexes/multi/test_formats.py index 75f23fb2f32ba..792dcf4c535e3 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_formats.py @@ -1,9 +1,10 @@ import warnings +import numpy as np import pytest import pandas as pd -from pandas import MultiIndex +from pandas import Index, MultiIndex import pandas._testing as tm @@ -57,7 +58,8 @@ def test_repr_with_unicode_data(): def test_repr_roundtrip_raises(): mi = MultiIndex.from_product([list("ab"), range(3)], names=["first", "second"]) - with pytest.raises(TypeError): + msg = "Must pass both levels and codes" + with pytest.raises(TypeError, match=msg): eval(repr(mi)) @@ -76,6 +78,17 @@ def test_repr_max_seq_item_setting(idx): class TestRepr: + def test_unicode_repr_issues(self): + levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])] + codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] + index = MultiIndex(levels=levels, codes=codes) + + repr(index.levels) + + # FIXME: dont leave commented-out + # NumPy bug + # repr(index.get_level_values(1)) + def test_repr(self, idx): result = idx[:1].__repr__() expected = """\ diff --git a/pandas/tests/indexes/multi/test_get_level_values.py b/pandas/tests/indexes/multi/test_get_level_values.py new file mode 100644 index 0000000000000..985fe5773ceed --- /dev/null +++ 
b/pandas/tests/indexes/multi/test_get_level_values.py @@ -0,0 +1,105 @@ +import numpy as np + +import pandas as pd +from pandas import CategoricalIndex, Index, MultiIndex, Timestamp, date_range +import pandas._testing as tm + + +class TestGetLevelValues: + def test_get_level_values_box_datetime64(self): + + dates = date_range("1/1/2000", periods=4) + levels = [dates, [0, 1]] + codes = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] + + index = MultiIndex(levels=levels, codes=codes) + + assert isinstance(index.get_level_values(0)[0], Timestamp) + + +def test_get_level_values(idx): + result = idx.get_level_values(0) + expected = Index(["foo", "foo", "bar", "baz", "qux", "qux"], name="first") + tm.assert_index_equal(result, expected) + assert result.name == "first" + + result = idx.get_level_values("first") + expected = idx.get_level_values(0) + tm.assert_index_equal(result, expected) + + # GH 10460 + index = MultiIndex( + levels=[CategoricalIndex(["A", "B"]), CategoricalIndex([1, 2, 3])], + codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])], + ) + + exp = CategoricalIndex(["A", "A", "A", "B", "B", "B"]) + tm.assert_index_equal(index.get_level_values(0), exp) + exp = CategoricalIndex([1, 2, 3, 1, 2, 3]) + tm.assert_index_equal(index.get_level_values(1), exp) + + +def test_get_level_values_all_na(): + # GH#17924 when level entirely consists of nan + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(0) + expected = pd.Index([np.nan, np.nan, np.nan], dtype=np.float64) + tm.assert_index_equal(result, expected) + + result = index.get_level_values(1) + expected = pd.Index(["a", np.nan, 1], dtype=object) + tm.assert_index_equal(result, expected) + + +def test_get_level_values_int_with_na(): + # GH#17924 + arrays = [["a", "b", "b"], [1, np.nan, 2]] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(1) + expected = Index([1, np.nan, 2]) + tm.assert_index_equal(result, expected) + + arrays = [["a", "b", "b"], [np.nan, np.nan, 2]] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(1) + expected = Index([np.nan, np.nan, 2]) + tm.assert_index_equal(result, expected) + + +def test_get_level_values_na(): + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(0) + expected = pd.Index([np.nan, np.nan, np.nan]) + tm.assert_index_equal(result, expected) + + result = index.get_level_values(1) + expected = pd.Index(["a", np.nan, 1]) + tm.assert_index_equal(result, expected) + + arrays = [["a", "b", "b"], pd.DatetimeIndex([0, 1, pd.NaT])] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(1) + expected = pd.DatetimeIndex([0, 1, pd.NaT]) + tm.assert_index_equal(result, expected) + + arrays = [[], []] + index = pd.MultiIndex.from_arrays(arrays) + result = index.get_level_values(0) + expected = pd.Index([], dtype=object) + tm.assert_index_equal(result, expected) + + +def test_get_level_values_when_periods(): + # GH33131. See also discussion in GH32669. + # This test can probably be removed when PeriodIndex._engine is removed. 
+ from pandas import Period, PeriodIndex + + idx = MultiIndex.from_arrays( + [PeriodIndex([Period("2019Q1"), Period("2019Q2")], name="b")] + ) + idx2 = MultiIndex.from_arrays( + [idx._get_level_values(level) for level in range(idx.nlevels)] + ) + assert all(x.is_monotonic for x in idx2.levels) diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 675a1e2e832f3..8a3deca0236e4 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import CategoricalIndex, Index, MultiIndex +from pandas import CategoricalIndex, MultiIndex import pandas._testing as tm @@ -27,90 +27,6 @@ def test_get_level_number_integer(idx): idx._get_level_number("fourth") -def test_get_level_values(idx): - result = idx.get_level_values(0) - expected = Index(["foo", "foo", "bar", "baz", "qux", "qux"], name="first") - tm.assert_index_equal(result, expected) - assert result.name == "first" - - result = idx.get_level_values("first") - expected = idx.get_level_values(0) - tm.assert_index_equal(result, expected) - - # GH 10460 - index = MultiIndex( - levels=[CategoricalIndex(["A", "B"]), CategoricalIndex([1, 2, 3])], - codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])], - ) - - exp = CategoricalIndex(["A", "A", "A", "B", "B", "B"]) - tm.assert_index_equal(index.get_level_values(0), exp) - exp = CategoricalIndex([1, 2, 3, 1, 2, 3]) - tm.assert_index_equal(index.get_level_values(1), exp) - - -def test_get_value_duplicates(): - index = MultiIndex( - levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], - codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], - names=["tag", "day"], - ) - - assert index.get_loc("D") == slice(0, 3) - - -def test_get_level_values_all_na(): - # GH 17924 when level entirely consists of nan - arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] - index = pd.MultiIndex.from_arrays(arrays) - result = index.get_level_values(0) - expected = pd.Index([np.nan, np.nan, np.nan], dtype=np.float64) - tm.assert_index_equal(result, expected) - - result = index.get_level_values(1) - expected = pd.Index(["a", np.nan, 1], dtype=object) - tm.assert_index_equal(result, expected) - - -def test_get_level_values_int_with_na(): - # GH 17924 - arrays = [["a", "b", "b"], [1, np.nan, 2]] - index = pd.MultiIndex.from_arrays(arrays) - result = index.get_level_values(1) - expected = Index([1, np.nan, 2]) - tm.assert_index_equal(result, expected) - - arrays = [["a", "b", "b"], [np.nan, np.nan, 2]] - index = pd.MultiIndex.from_arrays(arrays) - result = index.get_level_values(1) - expected = Index([np.nan, np.nan, 2]) - tm.assert_index_equal(result, expected) - - -def test_get_level_values_na(): - arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] - index = pd.MultiIndex.from_arrays(arrays) - result = index.get_level_values(0) - expected = pd.Index([np.nan, np.nan, np.nan]) - tm.assert_index_equal(result, expected) - - result = index.get_level_values(1) - expected = pd.Index(["a", np.nan, 1]) - tm.assert_index_equal(result, expected) - - arrays = [["a", "b", "b"], pd.DatetimeIndex([0, 1, pd.NaT])] - index = pd.MultiIndex.from_arrays(arrays) - result = index.get_level_values(1) - expected = pd.DatetimeIndex([0, 1, pd.NaT]) - tm.assert_index_equal(result, expected) - - arrays = [[], []] - index = pd.MultiIndex.from_arrays(arrays) - result = index.get_level_values(0) - expected = pd.Index([], dtype=object) - 
tm.assert_index_equal(result, expected) - - def test_set_name_methods(idx, index_names): # so long as these are synonyms, we don't need to test set_names assert idx.rename == idx.set_names diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 39049006edb7c..8c0dae433c8f4 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -239,6 +239,203 @@ def test_get_indexer_with_missing_value(self, index_arr, labels, expected): result = idx.get_indexer(labels) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_methods(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # test getting an indexer for another index with different methods + # confirms that getting an indexer without a filling method, getting an + # indexer and backfilling, and getting an indexer and padding all behave + # correctly in the case where all of the target values fall in between + # several levels in the MultiIndex into which they are getting an indexer + # + # visually, the MultiIndexes used in this test are: + # mult_idx_1: + # 0: -1 0 + # 1: 2 + # 2: 3 + # 3: 4 + # 4: 0 0 + # 5: 2 + # 6: 3 + # 7: 4 + # 8: 1 0 + # 9: 2 + # 10: 3 + # 11: 4 + # + # mult_idx_2: + # 0: 0 1 + # 1: 3 + # 2: 4 + mult_idx_1 = MultiIndex.from_product([[-1, 0, 1], [0, 2, 3, 4]]) + mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) + + indexer = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, 6, 7], dtype=indexer.dtype) + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="backfill") + expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype) + tm.assert_almost_equal(expected, backfill_indexer) + + # ensure the legacy "bfill" option functions identically to "backfill" + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") + expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype) + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="pad") + expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) + tm.assert_almost_equal(expected, pad_indexer) + + # ensure the legacy "ffill" option functions identically to "pad" + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") + expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) + tm.assert_almost_equal(expected, pad_indexer) + + def test_get_indexer_three_or_more_levels(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests get_indexer() on MultiIndexes with 3+ levels + # visually, these are + # mult_idx_1: + # 0: 1 2 5 + # 1: 7 + # 2: 4 5 + # 3: 7 + # 4: 6 5 + # 5: 7 + # 6: 3 2 5 + # 7: 7 + # 8: 4 5 + # 9: 7 + # 10: 6 5 + # 11: 7 + # + # mult_idx_2: + # 0: 1 1 8 + # 1: 1 5 9 + # 2: 1 6 7 + # 3: 2 1 6 + # 4: 2 7 6 + # 5: 2 7 8 + # 6: 3 6 8 + mult_idx_1 = pd.MultiIndex.from_product([[1, 3], [2, 4, 6], [5, 7]]) + mult_idx_2 = pd.MultiIndex.from_tuples( + [ + (1, 1, 8), + (1, 5, 9), + (1, 6, 7), + (2, 1, 6), + (2, 7, 7), + (2, 7, 8), + (3, 6, 8), + ] + ) + # sanity check + assert mult_idx_1.is_monotonic + assert mult_idx_1.is_unique + assert mult_idx_2.is_monotonic + assert mult_idx_2.is_unique + + # show the relationships between the two + assert mult_idx_2[0] < mult_idx_1[0] + assert mult_idx_1[3] < mult_idx_2[1] < mult_idx_1[4] + assert mult_idx_1[5] == mult_idx_2[2] + assert mult_idx_1[5] < mult_idx_2[3] < mult_idx_1[6] + assert mult_idx_1[5] < 
mult_idx_2[4] < mult_idx_1[6] + assert mult_idx_1[5] < mult_idx_2[5] < mult_idx_1[6] + assert mult_idx_1[-1] < mult_idx_2[6] + + indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, -1, 5, -1, -1, -1, -1], dtype=indexer_no_fill.dtype) + tm.assert_almost_equal(expected, indexer_no_fill) + + # test with backfilling + indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method="backfill") + expected = np.array([0, 4, 5, 6, 6, 6, -1], dtype=indexer_backfilled.dtype) + tm.assert_almost_equal(expected, indexer_backfilled) + + # now, the same thing, but forward-filled (aka "padded") + indexer_padded = mult_idx_1.get_indexer(mult_idx_2, method="pad") + expected = np.array([-1, 3, 5, 5, 5, 5, 11], dtype=indexer_padded.dtype) + tm.assert_almost_equal(expected, indexer_padded) + + # now, do the indexing in the other direction + assert mult_idx_2[0] < mult_idx_1[0] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[1] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[2] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[3] < mult_idx_2[1] + assert mult_idx_2[1] < mult_idx_1[4] < mult_idx_2[2] + assert mult_idx_2[2] == mult_idx_1[5] + assert mult_idx_2[5] < mult_idx_1[6] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[7] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[8] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[9] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[10] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[11] < mult_idx_2[6] + + indexer = mult_idx_2.get_indexer(mult_idx_1) + expected = np.array( + [-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1], dtype=indexer.dtype + ) + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_2.get_indexer(mult_idx_1, method="bfill") + expected = np.array( + [1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], dtype=backfill_indexer.dtype + ) + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad") + expected = np.array( + [0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype=pad_indexer.dtype + ) + tm.assert_almost_equal(expected, pad_indexer) + + def test_get_indexer_crossing_levels(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests a corner case with get_indexer() with MultiIndexes where, when we + # need to "carry" across levels, proper tuple ordering is respected + # + # the MultiIndexes used in this test, visually, are: + # mult_idx_1: + # 0: 1 1 1 1 + # 1: 2 + # 2: 2 1 + # 3: 2 + # 4: 1 2 1 1 + # 5: 2 + # 6: 2 1 + # 7: 2 + # 8: 2 1 1 1 + # 9: 2 + # 10: 2 1 + # 11: 2 + # 12: 2 2 1 1 + # 13: 2 + # 14: 2 1 + # 15: 2 + # + # mult_idx_2: + # 0: 1 3 2 2 + # 1: 2 3 2 2 + mult_idx_1 = pd.MultiIndex.from_product([[1, 2]] * 4) + mult_idx_2 = pd.MultiIndex.from_tuples([(1, 3, 2, 2), (2, 3, 2, 2)]) + + # show the tuple orderings, which get_indexer() should respect + assert mult_idx_1[7] < mult_idx_2[0] < mult_idx_1[8] + assert mult_idx_1[-1] < mult_idx_2[1] + + indexer = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, -1], dtype=indexer.dtype) + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") + expected = np.array([8, -1], dtype=backfill_indexer.dtype) + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") + expected = np.array([7, 15], dtype=pad_indexer.dtype) + tm.assert_almost_equal(expected, pad_indexer) + def test_getitem(idx): # scalar @@ -441,6 +638,99 @@ def 
test_get_loc_with_values_including_missing_values(self): expected = slice(2, 4, None) assert idx.get_loc((np.nan, 1)) == expected + def test_get_loc_duplicates2(self): + # TODO: de-duplicate with test_get_loc_duplicates above? + index = MultiIndex( + levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=["tag", "day"], + ) + + assert index.get_loc("D") == slice(0, 3) + + +class TestWhere: + def test_where(self): + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + + msg = r"\.where is not supported for MultiIndex operations" + with pytest.raises(NotImplementedError, match=msg): + i.where(True) + + @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) + def test_where_array_like(self, klass): + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + cond = [False, True] + msg = r"\.where is not supported for MultiIndex operations" + with pytest.raises(NotImplementedError, match=msg): + i.where(klass(cond)) + + +class TestContains: + def test_contains_top_level(self): + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) + assert "A" in midx + assert "A" not in midx._engine + + def test_contains_with_nat(self): + # MI with a NaT + mi = MultiIndex( + levels=[["C"], pd.date_range("2012-01-01", periods=5)], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, "B"], + ) + assert ("C", pd.Timestamp("2012-01-01")) in mi + for val in mi.values: + assert val in mi + + def test_contains(self, idx): + assert ("foo", "two") in idx + assert ("bar", "two") not in idx + assert None not in idx + + def test_contains_with_missing_value(self): + # GH#19132 + idx = MultiIndex.from_arrays([[1, np.nan, 2]]) + assert np.nan in idx + + idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]]) + assert np.nan not in idx + assert (1, np.nan) in idx + + def test_multiindex_contains_dropped(self): + # GH#19027 + # test that dropped MultiIndex levels are not in the MultiIndex + # despite continuing to be in the MultiIndex's levels + idx = MultiIndex.from_product([[1, 2], [3, 4]]) + assert 2 in idx + idx = idx.drop(2) + + # drop implementation keeps 2 in the levels + assert 2 in idx.levels[0] + # but it should no longer be in the index itself + assert 2 not in idx + + # also applies to strings + idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + assert "a" in idx + idx = idx.drop("a") + assert "a" in idx.levels[0] + assert "a" not in idx + + def test_contains_td64_level(self): + # GH#24570 + tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") + idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) + assert tx[0] in idx + assert "element_not_exit" not in idx + assert "0 day 09:30:00" in idx + + @pytest.mark.slow + def test_large_mi_contains(self): + # GH#10645 + result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) + assert not (10 ** 6, 0) in result + def test_timestamp_multiindex_indexer(): # https://github.com/pandas-dev/pandas/issues/26944 @@ -498,3 +788,41 @@ def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_id idx = MultiIndex.from_arrays(index_arr) result = idx.slice_indexer(start=start_idx, end=end_idx) assert result == expected + + +def test_pyint_engine(): + # GH#18519 : when combinations of codes cannot be represented in 64 + # bits, the index underlying the MultiIndex engine works with Python + # integers, rather than uint64. 
+ N = 5 + keys = [ + tuple(l) + for l in [ + [0] * 10 * N, + [1] * 10 * N, + [2] * 10 * N, + [np.nan] * N + [2] * 9 * N, + [0] * N + [2] * 9 * N, + [np.nan] * N + [2] * 8 * N + [0] * N, + ] + ] + # Each level contains 4 elements (including NaN), so it is represented + # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a + # 64 bit engine and truncating the first levels, the fourth and fifth + # keys would collide; if truncating the last levels, the fifth and + # sixth; if rotating bits rather than shifting, the third and fifth. + + for idx in range(len(keys)): + index = MultiIndex.from_tuples(keys) + assert index.get_loc(keys[idx]) == idx + + expected = np.arange(idx + 1, dtype=np.intp) + result = index.get_indexer([keys[i] for i in expected]) + tm.assert_numpy_array_equal(result, expected) + + # With missing key: + idces = range(len(keys)) + expected = np.array([-1] + list(idces), dtype=np.intp) + missing = tuple([0, 1] * 5 * N) + result = index.get_indexer([missing] + [keys[i] for i in idces]) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_isin.py similarity index 78% rename from pandas/tests/indexes/multi/test_contains.py rename to pandas/tests/indexes/multi/test_isin.py index 49aa63210cd5e..122263e6ec198 100644 --- a/pandas/tests/indexes/multi/test_contains.py +++ b/pandas/tests/indexes/multi/test_isin.py @@ -3,35 +3,10 @@ from pandas.compat import PYPY -import pandas as pd from pandas import MultiIndex import pandas._testing as tm -def test_contains_top_level(): - midx = MultiIndex.from_product([["A", "B"], [1, 2]]) - assert "A" in midx - assert "A" not in midx._engine - - -def test_contains_with_nat(): - # MI with a NaT - mi = MultiIndex( - levels=[["C"], pd.date_range("2012-01-01", periods=5)], - codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], - names=[None, "B"], - ) - assert ("C", pd.Timestamp("2012-01-01")) in mi - for val in mi.values: - assert val in mi - - -def test_contains(idx): - assert ("foo", "two") in idx - assert ("bar", "two") not in idx - assert None not in idx - - @pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy") def test_isin_nan_pypy(): idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) @@ -100,16 +75,6 @@ def test_isin_level_kwarg(): idx.isin(vals_1, level="C") -def test_contains_with_missing_value(): - # issue 19132 - idx = MultiIndex.from_arrays([[1, np.nan, 2]]) - assert np.nan in idx - - idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]]) - assert np.nan not in idx - assert (1, np.nan) in idx - - @pytest.mark.parametrize( "labels,expected,level", [ diff --git a/pandas/tests/indexes/multi/test_lexsort.py b/pandas/tests/indexes/multi/test_lexsort.py new file mode 100644 index 0000000000000..1d2ad8e02697e --- /dev/null +++ b/pandas/tests/indexes/multi/test_lexsort.py @@ -0,0 +1,46 @@ +from pandas import MultiIndex + + +class TestIsLexsorted: + def test_is_lexsorted(self): + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + ) + assert index.is_lexsorted() + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]] + ) + assert not index.is_lexsorted() + + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]] + ) + assert not index.is_lexsorted() + assert index.lexsort_depth == 0 + + +class TestLexsortDepth: + def test_lexsort_depth(self): + # Test that lexsort_depth return the correct sortorder + # 
when it was given to the MultiIndex const. + # GH#28518 + + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 + ) + assert index.lexsort_depth == 2 + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=1 + ) + assert index.lexsort_depth == 1 + + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=0 + ) + assert index.lexsort_depth == 0 diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index a17e1e9928bff..4c9d518778ceb 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -1,55 +1,16 @@ import numpy as np import pytest -from pandas._libs.tslib import iNaT - import pandas as pd -from pandas import Int64Index, MultiIndex, PeriodIndex, UInt64Index +from pandas import MultiIndex import pandas._testing as tm -from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin def test_fillna(idx): # GH 11343 - - # TODO: Remove or Refactor. Not Implemented for MultiIndex - for name, index in [("idx", idx)]: - if len(index) == 0: - pass - elif isinstance(index, MultiIndex): - idx = index.copy() - msg = "isna is not defined for MultiIndex" - with pytest.raises(NotImplementedError, match=msg): - idx.fillna(idx[0]) - else: - idx = index.copy() - result = idx.fillna(idx[0]) - tm.assert_index_equal(result, idx) - assert result is not idx - - msg = "'value' must be a scalar, passed: " - with pytest.raises(TypeError, match=msg): - idx.fillna([idx[0]]) - - idx = index.copy() - values = idx.values - - if isinstance(index, DatetimeIndexOpsMixin): - values[1] = iNaT - elif isinstance(index, (Int64Index, UInt64Index)): - continue - else: - values[1] = np.nan - - if isinstance(index, PeriodIndex): - idx = type(index)(values, freq=index.freq) - else: - idx = type(index)(values) - - expected = np.array([False] * len(idx), dtype=bool) - expected[1] = True - tm.assert_numpy_array_equal(idx._isnan, expected) - assert idx.hasnans is True + msg = "isna is not defined for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + idx.fillna(idx[0]) def test_dropna(): @@ -141,3 +102,13 @@ def test_nan_stays_float(): assert pd.isna(df0.index.get_level_values(1)).all() # the following failed in 0.14.1 assert pd.isna(dfm.index.get_level_values(1)[:-1]).all() + + +def test_tuples_have_na(): + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) + + assert pd.isna(index[4][0]) + assert pd.isna(index.values[4][0]) diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 2e39c714ca7af..de32bd94be491 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -1,5 +1,8 @@ +from datetime import datetime + import numpy as np import pytest +import pytz import pandas as pd from pandas import Index, MultiIndex @@ -95,6 +98,53 @@ def test_append(idx): assert result.equals(idx) +def test_append_index(): + idx1 = Index([1.1, 1.2, 1.3]) + idx2 = pd.date_range("2011-01-01", freq="D", periods=3, tz="Asia/Tokyo") + idx3 = Index(["A", "B", "C"]) + + midx_lv2 = MultiIndex.from_arrays([idx1, idx2]) + midx_lv3 = MultiIndex.from_arrays([idx1, idx2, idx3]) + + result = idx1.append(midx_lv2) + + # see gh-7112 + tz = pytz.timezone("Asia/Tokyo") + expected_tuples = [ + (1.1, tz.localize(datetime(2011, 1, 
1))), + (1.2, tz.localize(datetime(2011, 1, 2))), + (1.3, tz.localize(datetime(2011, 1, 3))), + ] + expected = Index([1.1, 1.2, 1.3] + expected_tuples) + tm.assert_index_equal(result, expected) + + result = midx_lv2.append(idx1) + expected = Index(expected_tuples + [1.1, 1.2, 1.3]) + tm.assert_index_equal(result, expected) + + result = midx_lv2.append(midx_lv2) + expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2)]) + tm.assert_index_equal(result, expected) + + result = midx_lv2.append(midx_lv3) + tm.assert_index_equal(result, expected) + + result = midx_lv3.append(midx_lv2) + expected = Index._simple_new( + np.array( + [ + (1.1, tz.localize(datetime(2011, 1, 1)), "A"), + (1.2, tz.localize(datetime(2011, 1, 2)), "B"), + (1.3, tz.localize(datetime(2011, 1, 3)), "C"), + ] + + expected_tuples, + dtype=object, + ), + None, + ) + tm.assert_index_equal(result, expected) + + def test_repeat(): reps = 2 numbers = [1, 2, 3] diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 627127f7b5b53..d7427ee622977 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -7,7 +7,6 @@ @pytest.mark.parametrize("case", [0.5, "xxx"]) -@pytest.mark.parametrize("sort", [None, False]) @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) @@ -18,7 +17,6 @@ def test_set_ops_error_cases(idx, case, sort, method): getattr(idx, method)(case, sort=sort) -@pytest.mark.parametrize("sort", [None, False]) @pytest.mark.parametrize("klass", [MultiIndex, np.array, Series, list]) def test_intersection_base(idx, sort, klass): first = idx[2::-1] # first 3 elements reversed @@ -39,7 +37,6 @@ def test_intersection_base(idx, sort, klass): first.intersection([1, 2, 3], sort=sort) -@pytest.mark.parametrize("sort", [None, False]) @pytest.mark.parametrize("klass", [MultiIndex, np.array, Series, list]) def test_union_base(idx, sort, klass): first = idx[::-1] @@ -60,7 +57,6 @@ def test_union_base(idx, sort, klass): first.union([1, 2, 3], sort=sort) -@pytest.mark.parametrize("sort", [None, False]) def test_difference_base(idx, sort): second = idx[4:] answer = idx[:4] @@ -83,7 +79,6 @@ def test_difference_base(idx, sort): idx.difference([1, 2, 3], sort=sort) -@pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference(idx, sort): first = idx[1:] second = idx[:-1] @@ -123,7 +118,6 @@ def test_empty(idx): assert idx[:0].empty -@pytest.mark.parametrize("sort", [None, False]) def test_difference(idx, sort): first = idx @@ -215,7 +209,8 @@ def test_difference_sort_incomparable(): # sort=None, the default # MultiIndex.difference deviates here from other difference # implementations in not catching the TypeError - with pytest.raises(TypeError): + msg = "'<' not supported between instances of 'Timestamp' and 'int'" + with pytest.raises(TypeError, match=msg): result = idx.difference(other) # sort=False @@ -234,7 +229,6 @@ def test_difference_sort_incomparable_true(): idx.difference(other, sort=True) -@pytest.mark.parametrize("sort", [None, False]) def test_union(idx, sort): piece1 = idx[:5][::-1] piece2 = idx[3:] @@ -253,6 +247,7 @@ def test_union(idx, sort): the_union = idx.union(idx[:0], sort=sort) assert the_union is idx + # FIXME: dont leave commented-out # won't work in python 3 # tuples = _index.values # result = _index[:4] | tuples[4:] @@ -270,7 +265,6 @@ def test_union(idx, sort): # assert result.equals(result2) -@pytest.mark.parametrize("sort", [None, False]) 
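Note on the `@pytest.mark.parametrize("sort", [None, False])` decorators being removed throughout this file: the tests keep their `sort` argument, so the two values are presumably now supplied by a shared fixture defined in a conftest that is not part of this patch. A minimal sketch of such a fixture, assuming that is the mechanism (the fixture body and its location are an assumption, not shown in this diff):

```python
# Hypothetical conftest.py content -- not part of this patch.
# Supplies the same two values the removed parametrize decorators provided.
import pytest


@pytest.fixture(params=[None, False])
def sort(request):
    """Valid values for the ``sort`` keyword used by the set-operation tests."""
    return request.param
```
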
def test_intersection(idx, sort): piece1 = idx[:5][::-1] piece2 = idx[3:] @@ -290,6 +284,7 @@ def test_intersection(idx, sort): expected = idx[:0] assert empty.equals(expected) + # FIXME: dont leave commented-out # can't do in python 3 # tuples = _index.values # result = _index & tuples @@ -342,7 +337,8 @@ def test_union_sort_other_incomparable(): idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) # default, sort=None - result = idx.union(idx[:1]) + with tm.assert_produces_warning(RuntimeWarning): + result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) # sort=False @@ -359,6 +355,17 @@ def test_union_sort_other_incomparable_sort(): idx.union(idx[:1], sort=True) +def test_union_non_object_dtype_raises(): + # GH#32646 raise NotImplementedError instead of less-informative error + mi = pd.MultiIndex.from_product([["a", "b"], [1, 2]]) + + idx = mi.levels[1] + + msg = "Can only union MultiIndex with MultiIndex or Index of tuples" + with pytest.raises(NotImplementedError, match=msg): + mi.union(idx) + + @pytest.mark.parametrize( "method", ["union", "intersection", "difference", "symmetric_difference"] ) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py new file mode 100644 index 0000000000000..473e370c76f8b --- /dev/null +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -0,0 +1,237 @@ +import numpy as np +import pytest + +from pandas import Float64Index, Int64Index, Series, UInt64Index +import pandas._testing as tm + + +@pytest.fixture +def index_large(): + # large values used in UInt64Index tests where no compat needed with Int64/Float64 + large = [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25] + return UInt64Index(large) + + +class TestGetLoc: + def test_get_loc_float64(self): + idx = Float64Index([0.0, 1.0, 2.0]) + for method in [None, "pad", "backfill", "nearest"]: + assert idx.get_loc(1, method) == 1 + if method is not None: + assert idx.get_loc(1, method, tolerance=0) == 1 + + for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: + assert idx.get_loc(1.1, method) == loc + assert idx.get_loc(1.1, method, tolerance=0.9) == loc + + with pytest.raises(KeyError, match="^'foo'$"): + idx.get_loc("foo") + with pytest.raises(KeyError, match=r"^1\.5$"): + idx.get_loc(1.5) + with pytest.raises(KeyError, match=r"^1\.5$"): + idx.get_loc(1.5, method="pad", tolerance=0.1) + with pytest.raises(KeyError, match="^True$"): + idx.get_loc(True) + with pytest.raises(KeyError, match="^False$"): + idx.get_loc(False) + + with pytest.raises(ValueError, match="must be numeric"): + idx.get_loc(1.4, method="nearest", tolerance="foo") + + with pytest.raises(ValueError, match="must contain numeric elements"): + idx.get_loc(1.4, method="nearest", tolerance=np.array(["foo"])) + + with pytest.raises( + ValueError, match="tolerance size must match target index size" + ): + idx.get_loc(1.4, method="nearest", tolerance=np.array([1, 2])) + + def test_get_loc_na(self): + idx = Float64Index([np.nan, 1, 2]) + assert idx.get_loc(1) == 1 + assert idx.get_loc(np.nan) == 0 + + idx = Float64Index([np.nan, 1, np.nan]) + assert idx.get_loc(1) == 1 + + # FIXME: dont leave commented-out + # representable by slice [0:2:2] + # pytest.raises(KeyError, idx.slice_locs, np.nan) + sliced = idx.slice_locs(np.nan) + assert isinstance(sliced, tuple) + assert sliced == (0, 3) + + # not representable by slice + idx = Float64Index([np.nan, 1, np.nan, np.nan]) + assert idx.get_loc(1) == 1 + msg = "'Cannot get left slice bound for 
non-unique label: nan" + with pytest.raises(KeyError, match=msg): + idx.slice_locs(np.nan) + + def test_get_loc_missing_nan(self): + # GH#8569 + idx = Float64Index([1, 2]) + assert idx.get_loc(1) == 0 + with pytest.raises(KeyError, match=r"^3$"): + idx.get_loc(3) + with pytest.raises(KeyError, match="^nan$"): + idx.get_loc(np.nan) + with pytest.raises(TypeError, match=r"'\[nan\]' is an invalid key"): + # listlike/non-hashable raises TypeError + idx.get_loc([np.nan]) + + +class TestGetIndexer: + def test_get_indexer_float64(self): + idx = Float64Index([0.0, 1.0, 2.0]) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = [-0.1, 0.5, 1.1] + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + + def test_get_indexer_nan(self): + # GH#7820 + result = Float64Index([1, 2, np.nan]).get_indexer([np.nan]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_int64(self): + index = Int64Index(range(0, 20, 2)) + target = Int64Index(np.arange(10)) + indexer = index.get_indexer(target) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = Int64Index(np.arange(10)) + indexer = index.get_indexer(target, method="pad") + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = Int64Index(np.arange(10)) + indexer = index.get_indexer(target, method="backfill") + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_uint64(self, index_large): + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = index_large.get_indexer(target) + expected = np.array([0, -1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = index_large.get_indexer(target, method="pad") + expected = np.array([0, 0, 1, 2, 3, 4, 4, 4, 4, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = index_large.get_indexer(target, method="backfill") + expected = np.array([0, 1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + +class TestWhere: + @pytest.mark.parametrize( + "index", + [ + Float64Index(np.arange(5, dtype="float64")), + Int64Index(range(0, 20, 2)), + UInt64Index(np.arange(5, dtype="uint64")), + ], + ) + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) + def test_where(self, klass, index): + cond = [True] * len(index) + expected = index + result = index.where(klass(cond)) + + cond = [False] + [True] * (len(index) - 1) + expected = Float64Index([index._na_value] + index[1:].tolist()) + result = index.where(klass(cond)) + tm.assert_index_equal(result, expected) + + +class TestTake: + @pytest.mark.parametrize("klass", [Float64Index, Int64Index, UInt64Index]) + def test_take_preserve_name(self, klass): + index = klass([1, 2, 3, 4], name="foo") + taken = index.take([3, 0, 1]) + assert index.name == taken.name + + def 
test_take_fill_value_float64(self): + # GH 12631 + idx = Float64Index([1.0, 2.0, 3.0], name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = Float64Index([2.0, 1.0, 3.0], name="xxx") + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = Float64Index([2.0, 1.0, np.nan], name="xxx") + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = Float64Index([2.0, 1.0, 3.0], name="xxx") + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + @pytest.mark.parametrize("klass", [Int64Index, UInt64Index]) + def test_take_fill_value_ints(self, klass): + # see gh-12631 + idx = klass([1, 2, 3], name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = klass([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + + name = klass.__name__ + msg = f"Unable to fill values because {name} cannot contain NA" + + # fill_value=True + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -1]), fill_value=True) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = klass([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + +class TestContains: + def test_contains_float64_nans(self): + index = Float64Index([1.0, 2.0, np.nan]) + assert np.nan in index + + def test_contains_float64_not_nans(self): + index = Float64Index([1.0, 2.0, np.nan]) + assert 1.0 in index diff --git a/pandas/tests/indexes/period/test_astype.py b/pandas/tests/indexes/period/test_astype.py index 2f10e45193d5d..b286191623ebb 100644 --- a/pandas/tests/indexes/period/test_astype.py +++ b/pandas/tests/indexes/period/test_astype.py @@ -27,31 +27,34 @@ def test_astype_raises(self, dtype): def test_astype_conversion(self): # GH#13149, GH#13209 - idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D") + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D", name="idx") result = idx.astype(object) expected = Index( [Period("2016-05-16", freq="D")] + [Period(NaT, freq="D")] * 3, dtype="object", + name="idx", ) tm.assert_index_equal(result, expected) result = idx.astype(np.int64) - expected = Int64Index([16937] + [-9223372036854775808] * 3, dtype=np.int64) + expected = Int64Index( + [16937] + [-9223372036854775808] * 3, dtype=np.int64, name="idx" + ) tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index(str(x) for x in idx) + expected = Index([str(x) for x in idx], name="idx") tm.assert_index_equal(result, expected) - idx = period_range("1990", "2009", freq="A") + idx = period_range("1990", "2009", freq="A", name="idx") result = idx.astype("i8") - tm.assert_index_equal(result, 
Index(idx.asi8)) + tm.assert_index_equal(result, Index(idx.asi8, name="idx")) tm.assert_numpy_array_equal(result.values, idx.asi8) def test_astype_uint(self): - arr = period_range("2000", periods=2) - expected = UInt64Index(np.array([10957, 10958], dtype="uint64")) + arr = period_range("2000", periods=2, name="idx") + expected = UInt64Index(np.array([10957, 10958], dtype="uint64"), name="idx") tm.assert_index_equal(arr.astype("uint64"), expected) tm.assert_index_equal(arr.astype("uint32"), expected) @@ -116,10 +119,10 @@ def test_astype_object2(self): assert result_list[2] is NaT def test_astype_category(self): - obj = period_range("2000", periods=2) + obj = period_range("2000", periods=2, name="idx") result = obj.astype("category") expected = CategoricalIndex( - [Period("2000-01-01", freq="D"), Period("2000-01-02", freq="D")] + [Period("2000-01-01", freq="D"), Period("2000-01-02", freq="D")], name="idx" ) tm.assert_index_equal(result, expected) @@ -128,9 +131,9 @@ def test_astype_category(self): tm.assert_categorical_equal(result, expected) def test_astype_array_fallback(self): - obj = period_range("2000", periods=2) + obj = period_range("2000", periods=2, name="idx") result = obj.astype(bool) - expected = Index(np.array([True, True])) + expected = Index(np.array([True, True]), name="idx") tm.assert_index_equal(result, expected) result = obj._data.astype(bool) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index b5ff83ec7514d..4ec7ef64e2272 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -147,9 +147,9 @@ def test_constructor_fromarraylike(self): msg = "freq not specified and cannot be inferred" with pytest.raises(ValueError, match=msg): - PeriodIndex(idx._ndarray_values) + PeriodIndex(idx.asi8) with pytest.raises(ValueError, match=msg): - PeriodIndex(list(idx._ndarray_values)) + PeriodIndex(list(idx.asi8)) msg = "'Period' object is not iterable" with pytest.raises(TypeError, match=msg): @@ -327,7 +327,8 @@ def test_constructor_simple_new(self): result = idx._simple_new(idx._data, name="p") tm.assert_index_equal(result, idx) - with pytest.raises(AssertionError): + msg = "Should be numpy array of type i8" + with pytest.raises(AssertionError, match=msg): # Need ndarray, not Int64Index type(idx._data)._simple_new(idx.astype("i8"), freq=idx.freq) diff --git a/pandas/tests/indexes/period/test_fillna.py b/pandas/tests/indexes/period/test_fillna.py new file mode 100644 index 0000000000000..602e87333a6c1 --- /dev/null +++ b/pandas/tests/indexes/period/test_fillna.py @@ -0,0 +1,36 @@ +from pandas import Index, NaT, Period, PeriodIndex +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_period(self): + # GH#11343 + idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="H") + + exp = PeriodIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" + ) + result = idx.fillna(Period("2011-01-01 10:00", freq="H")) + tm.assert_index_equal(result, exp) + + exp = Index( + [ + Period("2011-01-01 09:00", freq="H"), + "x", + Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + result = idx.fillna("x") + tm.assert_index_equal(result, exp) + + exp = Index( + [ + Period("2011-01-01 09:00", freq="H"), + Period("2011-01-01", freq="D"), + Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + result = idx.fillna(Period("2011-01-01", freq="D")) + tm.assert_index_equal(result, exp) diff --git 
a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 077fa2a0b1c56..c4aaf6332ba15 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -31,6 +31,10 @@ def test_ellipsis(self): assert result.equals(idx) assert result is not idx + def test_getitem_slice_keeps_name(self): + idx = period_range("20010101", periods=10, freq="D", name="bob") + assert idx.name == idx[1:].name + def test_getitem(self): idx1 = period_range("2011-01-01", "2011-01-31", freq="D", name="idx") @@ -216,165 +220,7 @@ def test_getitem_day(self): s[v] -class TestWhere: - @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) - def test_where(self, klass): - i = period_range("20130101", periods=5, freq="D") - cond = [True] * len(i) - expected = i - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - - cond = [False] + [True] * (len(i) - 1) - expected = PeriodIndex([NaT] + i[1:].tolist(), freq="D") - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - - def test_where_other(self): - i = period_range("20130101", periods=5, freq="D") - for arr in [np.nan, NaT]: - result = i.where(notna(i), other=np.nan) - expected = i - tm.assert_index_equal(result, expected) - - i2 = i.copy() - i2 = PeriodIndex([NaT, NaT] + i[2:].tolist(), freq="D") - result = i.where(notna(i2), i2) - tm.assert_index_equal(result, i2) - - i2 = i.copy() - i2 = PeriodIndex([NaT, NaT] + i[2:].tolist(), freq="D") - result = i.where(notna(i2), i2.values) - tm.assert_index_equal(result, i2) - - def test_where_invalid_dtypes(self): - pi = period_range("20130101", periods=5, freq="D") - - i2 = pi.copy() - i2 = PeriodIndex([NaT, NaT] + pi[2:].tolist(), freq="D") - - with pytest.raises(TypeError, match="Where requires matching dtype"): - pi.where(notna(i2), i2.asi8) - - with pytest.raises(TypeError, match="Where requires matching dtype"): - pi.where(notna(i2), i2.asi8.view("timedelta64[ns]")) - - with pytest.raises(TypeError, match="Where requires matching dtype"): - pi.where(notna(i2), i2.to_timestamp("S")) - - -class TestTake: - def test_take(self): - # GH#10295 - idx1 = period_range("2011-01-01", "2011-01-31", freq="D", name="idx") - - for idx in [idx1]: - result = idx.take([0]) - assert result == Period("2011-01-01", freq="D") - - result = idx.take([5]) - assert result == Period("2011-01-06", freq="D") - - result = idx.take([0, 1, 2]) - expected = period_range("2011-01-01", "2011-01-03", freq="D", name="idx") - tm.assert_index_equal(result, expected) - assert result.freq == "D" - assert result.freq == expected.freq - - result = idx.take([0, 2, 4]) - expected = PeriodIndex( - ["2011-01-01", "2011-01-03", "2011-01-05"], freq="D", name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - assert result.freq == "D" - - result = idx.take([7, 4, 1]) - expected = PeriodIndex( - ["2011-01-08", "2011-01-05", "2011-01-02"], freq="D", name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - assert result.freq == "D" - - result = idx.take([3, 2, 5]) - expected = PeriodIndex( - ["2011-01-04", "2011-01-03", "2011-01-06"], freq="D", name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - assert result.freq == "D" - - result = idx.take([-3, 2, 5]) - expected = PeriodIndex( - ["2011-01-29", "2011-01-03", "2011-01-06"], freq="D", name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == 
expected.freq - assert result.freq == "D" - - def test_take_misc(self): - index = period_range(start="1/1/10", end="12/31/12", freq="D", name="idx") - expected = PeriodIndex( - [ - datetime(2010, 1, 6), - datetime(2010, 1, 7), - datetime(2010, 1, 9), - datetime(2010, 1, 13), - ], - freq="D", - name="idx", - ) - - taken1 = index.take([5, 6, 8, 12]) - taken2 = index[[5, 6, 8, 12]] - - for taken in [taken1, taken2]: - tm.assert_index_equal(taken, expected) - assert isinstance(taken, PeriodIndex) - assert taken.freq == index.freq - assert taken.name == expected.name - - def test_take_fill_value(self): - # GH#12631 - idx = PeriodIndex( - ["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx", freq="D" - ) - result = idx.take(np.array([1, 0, -1])) - expected = PeriodIndex( - ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" - ) - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = PeriodIndex( - ["2011-02-01", "2011-01-01", "NaT"], name="xxx", freq="D" - ) - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = PeriodIndex( - ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" - ) - tm.assert_index_equal(result, expected) - - msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" - ) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - msg = "index -5 is out of bounds for( axis 0 with)? size 3" - with pytest.raises(IndexError, match=msg): - idx.take(np.array([1, -5])) - - -class TestIndexing: +class TestGetLoc: def test_get_loc_msg(self): idx = period_range("2000-1-1", freq="A", periods=10) bad_period = Period("2012", "A") @@ -465,153 +311,68 @@ def test_get_loc_integer(self): with pytest.raises(KeyError, match="46"): pi2.get_loc(46) - @pytest.mark.parametrize("freq", ["H", "D"]) - def test_get_value_datetime_hourly(self, freq): - # get_loc and get_value should treat datetime objects symmetrically - dti = date_range("2016-01-01", periods=3, freq="MS") - pi = dti.to_period(freq) - ser = pd.Series(range(7, 10), index=pi) + # TODO: This method came from test_period; de-dup with version above + def test_get_loc2(self): + idx = period_range("2000-01-01", periods=3) - ts = dti[0] + for method in [None, "pad", "backfill", "nearest"]: + assert idx.get_loc(idx[1], method) == 1 + assert idx.get_loc(idx[1].asfreq("H", how="start"), method) == 1 + assert idx.get_loc(idx[1].to_timestamp(), method) == 1 + assert idx.get_loc(idx[1].to_timestamp().to_pydatetime(), method) == 1 + assert idx.get_loc(str(idx[1]), method) == 1 - assert pi.get_loc(ts) == 0 - assert pi.get_value(ser, ts) == 7 - assert ser[ts] == 7 - assert ser.loc[ts] == 7 + idx = period_range("2000-01-01", periods=5)[::2] + assert idx.get_loc("2000-01-02T12", method="nearest", tolerance="1 day") == 1 + assert ( + idx.get_loc("2000-01-02T12", method="nearest", tolerance=Timedelta("1D")) + == 1 + ) + assert ( + idx.get_loc( + "2000-01-02T12", method="nearest", tolerance=np.timedelta64(1, "D") + ) + == 1 + ) + assert ( + idx.get_loc("2000-01-02T12", method="nearest", tolerance=timedelta(1)) == 1 + ) - ts2 = ts + Timedelta(hours=3) - if freq == "H": - with pytest.raises(KeyError, match="2016-01-01 03:00"): - pi.get_loc(ts2) - with pytest.raises(KeyError, match="2016-01-01 
03:00"): - pi.get_value(ser, ts2) - with pytest.raises(KeyError, match="2016-01-01 03:00"): - ser[ts2] - with pytest.raises(KeyError, match="2016-01-01 03:00"): - ser.loc[ts2] - else: - assert pi.get_loc(ts2) == 0 - assert pi.get_value(ser, ts2) == 7 - assert ser[ts2] == 7 - assert ser.loc[ts2] == 7 + msg = "unit abbreviation w/o a number" + with pytest.raises(ValueError, match=msg): + idx.get_loc("2000-01-10", method="nearest", tolerance="foo") - def test_get_value_integer(self): - msg = "index 16801 is out of bounds for axis 0 with size 3" - dti = date_range("2016-01-01", periods=3) - pi = dti.to_period("D") - ser = pd.Series(range(3), index=pi) - with pytest.raises(IndexError, match=msg): - pi.get_value(ser, 16801) + msg = "Input has different freq=None from PeriodArray\\(freq=D\\)" + with pytest.raises(ValueError, match=msg): + idx.get_loc("2000-01-10", method="nearest", tolerance="1 hour") + with pytest.raises(KeyError, match=r"^Period\('2000-01-10', 'D'\)$"): + idx.get_loc("2000-01-10", method="nearest", tolerance="1 day") + with pytest.raises( + ValueError, match="list-like tolerance size must match target index size" + ): + idx.get_loc( + "2000-01-10", + method="nearest", + tolerance=[ + Timedelta("1 day").to_timedelta64(), + Timedelta("1 day").to_timedelta64(), + ], + ) - msg = "index 46 is out of bounds for axis 0 with size 3" - pi2 = dti.to_period("Y") # duplicates, ordinals are all 46 - ser2 = pd.Series(range(3), index=pi2) - with pytest.raises(IndexError, match=msg): - pi2.get_value(ser2, 46) - def test_is_monotonic_increasing(self): +class TestGetIndexer: + def test_get_indexer(self): # GH 17717 - p0 = Period("2017-09-01") - p1 = Period("2017-09-02") - p2 = Period("2017-09-03") + p1 = Period("2017-09-01") + p2 = Period("2017-09-04") + p3 = Period("2017-09-07") - idx_inc0 = PeriodIndex([p0, p1, p2]) - idx_inc1 = PeriodIndex([p0, p1, p1]) - idx_dec0 = PeriodIndex([p2, p1, p0]) - idx_dec1 = PeriodIndex([p2, p1, p1]) - idx = PeriodIndex([p1, p2, p0]) + tp0 = Period("2017-08-31") + tp1 = Period("2017-09-02") + tp2 = Period("2017-09-05") + tp3 = Period("2017-09-09") - assert idx_inc0.is_monotonic_increasing is True - assert idx_inc1.is_monotonic_increasing is True - assert idx_dec0.is_monotonic_increasing is False - assert idx_dec1.is_monotonic_increasing is False - assert idx.is_monotonic_increasing is False - - def test_is_monotonic_decreasing(self): - # GH 17717 - p0 = Period("2017-09-01") - p1 = Period("2017-09-02") - p2 = Period("2017-09-03") - - idx_inc0 = PeriodIndex([p0, p1, p2]) - idx_inc1 = PeriodIndex([p0, p1, p1]) - idx_dec0 = PeriodIndex([p2, p1, p0]) - idx_dec1 = PeriodIndex([p2, p1, p1]) - idx = PeriodIndex([p1, p2, p0]) - - assert idx_inc0.is_monotonic_decreasing is False - assert idx_inc1.is_monotonic_decreasing is False - assert idx_dec0.is_monotonic_decreasing is True - assert idx_dec1.is_monotonic_decreasing is True - assert idx.is_monotonic_decreasing is False - - def test_contains(self): - # GH 17717 - p0 = Period("2017-09-01") - p1 = Period("2017-09-02") - p2 = Period("2017-09-03") - p3 = Period("2017-09-04") - - ps0 = [p0, p1, p2] - idx0 = PeriodIndex(ps0) - ser = pd.Series(range(6, 9), index=idx0) - - for p in ps0: - assert p in idx0 - assert str(p) in idx0 - - # GH#31172 - # Higher-resolution period-like are _not_ considered as contained - key = "2017-09-01 00:00:01" - assert key not in idx0 - with pytest.raises(KeyError, match=key): - idx0.get_loc(key) - with pytest.raises(KeyError, match=key): - idx0.get_value(ser, key) - - assert "2017-09" in idx0 - - 
assert p3 not in idx0 - - def test_get_value(self): - # GH 17717 - p0 = Period("2017-09-01") - p1 = Period("2017-09-02") - p2 = Period("2017-09-03") - - idx0 = PeriodIndex([p0, p1, p2]) - input0 = pd.Series(np.array([1, 2, 3]), index=idx0) - expected0 = 2 - - result0 = idx0.get_value(input0, p1) - assert result0 == expected0 - - idx1 = PeriodIndex([p1, p1, p2]) - input1 = pd.Series(np.array([1, 2, 3]), index=idx1) - expected1 = input1.iloc[[0, 1]] - - result1 = idx1.get_value(input1, p1) - tm.assert_series_equal(result1, expected1) - - idx2 = PeriodIndex([p1, p2, p1]) - input2 = pd.Series(np.array([1, 2, 3]), index=idx2) - expected2 = input2.iloc[[0, 2]] - - result2 = idx2.get_value(input2, p1) - tm.assert_series_equal(result2, expected2) - - def test_get_indexer(self): - # GH 17717 - p1 = Period("2017-09-01") - p2 = Period("2017-09-04") - p3 = Period("2017-09-07") - - tp0 = Period("2017-08-31") - tp1 = Period("2017-09-02") - tp2 = Period("2017-09-05") - tp3 = Period("2017-09-09") - - idx = PeriodIndex([p1, p2, p3]) + idx = PeriodIndex([p1, p2, p3]) tm.assert_numpy_array_equal( idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) @@ -677,54 +438,6 @@ def test_get_indexer_non_unique(self): tm.assert_numpy_array_equal(result[0], expected_indexer) tm.assert_numpy_array_equal(result[1], expected_missing) - # TODO: This method came from test_period; de-dup with version above - def test_get_loc2(self): - idx = period_range("2000-01-01", periods=3) - - for method in [None, "pad", "backfill", "nearest"]: - assert idx.get_loc(idx[1], method) == 1 - assert idx.get_loc(idx[1].asfreq("H", how="start"), method) == 1 - assert idx.get_loc(idx[1].to_timestamp(), method) == 1 - assert idx.get_loc(idx[1].to_timestamp().to_pydatetime(), method) == 1 - assert idx.get_loc(str(idx[1]), method) == 1 - - idx = period_range("2000-01-01", periods=5)[::2] - assert idx.get_loc("2000-01-02T12", method="nearest", tolerance="1 day") == 1 - assert ( - idx.get_loc("2000-01-02T12", method="nearest", tolerance=Timedelta("1D")) - == 1 - ) - assert ( - idx.get_loc( - "2000-01-02T12", method="nearest", tolerance=np.timedelta64(1, "D") - ) - == 1 - ) - assert ( - idx.get_loc("2000-01-02T12", method="nearest", tolerance=timedelta(1)) == 1 - ) - - msg = "unit abbreviation w/o a number" - with pytest.raises(ValueError, match=msg): - idx.get_loc("2000-01-10", method="nearest", tolerance="foo") - - msg = "Input has different freq=None from PeriodArray\\(freq=D\\)" - with pytest.raises(ValueError, match=msg): - idx.get_loc("2000-01-10", method="nearest", tolerance="1 hour") - with pytest.raises(KeyError, match=r"^Period\('2000-01-10', 'D'\)$"): - idx.get_loc("2000-01-10", method="nearest", tolerance="1 day") - with pytest.raises( - ValueError, match="list-like tolerance size must match target index size" - ): - idx.get_loc( - "2000-01-10", - method="nearest", - tolerance=[ - Timedelta("1 day").to_timedelta64(), - Timedelta("1 day").to_timedelta64(), - ], - ) - # TODO: This method came from test_period; de-dup with version above def test_get_indexer2(self): idx = period_range("2000-01-01", periods=3).asfreq("H", how="start") @@ -778,20 +491,287 @@ def test_get_indexer2(self): ): idx.get_indexer(target, "nearest", tolerance=tol_bad) - def test_indexing(self): - # GH 4390, iat incorrectly indexing - index = period_range("1/1/2001", periods=10) - s = Series(np.random.randn(10), index=index) - expected = s[index[0]] - result = s.iat[0] - assert expected == result - - def test_period_index_indexer(self): - # GH4125 - idx = 
period_range("2002-01", "2003-12", freq="M") - df = pd.DataFrame(np.random.randn(24, 10), index=idx) - tm.assert_frame_equal(df, df.loc[idx]) - tm.assert_frame_equal(df, df.loc[list(idx)]) - tm.assert_frame_equal(df, df.loc[list(idx)]) - tm.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]]) - tm.assert_frame_equal(df, df.loc[list(idx)]) + +class TestWhere: + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) + def test_where(self, klass): + i = period_range("20130101", periods=5, freq="D") + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * (len(i) - 1) + expected = PeriodIndex([NaT] + i[1:].tolist(), freq="D") + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + + def test_where_other(self): + i = period_range("20130101", periods=5, freq="D") + for arr in [np.nan, NaT]: + result = i.where(notna(i), other=np.nan) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = PeriodIndex([NaT, NaT] + i[2:].tolist(), freq="D") + result = i.where(notna(i2), i2) + tm.assert_index_equal(result, i2) + + i2 = i.copy() + i2 = PeriodIndex([NaT, NaT] + i[2:].tolist(), freq="D") + result = i.where(notna(i2), i2.values) + tm.assert_index_equal(result, i2) + + def test_where_invalid_dtypes(self): + pi = period_range("20130101", periods=5, freq="D") + + i2 = pi.copy() + i2 = PeriodIndex([NaT, NaT] + pi[2:].tolist(), freq="D") + + with pytest.raises(TypeError, match="Where requires matching dtype"): + pi.where(notna(i2), i2.asi8) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + pi.where(notna(i2), i2.asi8.view("timedelta64[ns]")) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + pi.where(notna(i2), i2.to_timestamp("S")) + + +class TestTake: + def test_take(self): + # GH#10295 + idx1 = period_range("2011-01-01", "2011-01-31", freq="D", name="idx") + + for idx in [idx1]: + result = idx.take([0]) + assert result == Period("2011-01-01", freq="D") + + result = idx.take([5]) + assert result == Period("2011-01-06", freq="D") + + result = idx.take([0, 1, 2]) + expected = period_range("2011-01-01", "2011-01-03", freq="D", name="idx") + tm.assert_index_equal(result, expected) + assert result.freq == "D" + assert result.freq == expected.freq + + result = idx.take([0, 2, 4]) + expected = PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05"], freq="D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + result = idx.take([7, 4, 1]) + expected = PeriodIndex( + ["2011-01-08", "2011-01-05", "2011-01-02"], freq="D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + result = idx.take([3, 2, 5]) + expected = PeriodIndex( + ["2011-01-04", "2011-01-03", "2011-01-06"], freq="D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + result = idx.take([-3, 2, 5]) + expected = PeriodIndex( + ["2011-01-29", "2011-01-03", "2011-01-06"], freq="D", name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + assert result.freq == "D" + + def test_take_misc(self): + index = period_range(start="1/1/10", end="12/31/12", freq="D", name="idx") + expected = PeriodIndex( + [ + datetime(2010, 1, 6), + datetime(2010, 1, 7), + datetime(2010, 1, 9), + datetime(2010, 1, 13), + ], + 
freq="D", + name="idx", + ) + + taken1 = index.take([5, 6, 8, 12]) + taken2 = index[[5, 6, 8, 12]] + + for taken in [taken1, taken2]: + tm.assert_index_equal(taken, expected) + assert isinstance(taken, PeriodIndex) + assert taken.freq == index.freq + assert taken.name == expected.name + + def test_take_fill_value(self): + # GH#12631 + idx = PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx", freq="D" + ) + result = idx.take(np.array([1, 0, -1])) + expected = PeriodIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" + ) + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = PeriodIndex( + ["2011-02-01", "2011-01-01", "NaT"], name="xxx", freq="D" + ) + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = PeriodIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" + ) + tm.assert_index_equal(result, expected) + + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for( axis 0 with)? size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) + + +class TestGetValue: + def test_get_value(self): + # GH 17717 + p0 = Period("2017-09-01") + p1 = Period("2017-09-02") + p2 = Period("2017-09-03") + + idx0 = PeriodIndex([p0, p1, p2]) + input0 = pd.Series(np.array([1, 2, 3]), index=idx0) + expected0 = 2 + + result0 = idx0.get_value(input0, p1) + assert result0 == expected0 + + idx1 = PeriodIndex([p1, p1, p2]) + input1 = pd.Series(np.array([1, 2, 3]), index=idx1) + expected1 = input1.iloc[[0, 1]] + + result1 = idx1.get_value(input1, p1) + tm.assert_series_equal(result1, expected1) + + idx2 = PeriodIndex([p1, p2, p1]) + input2 = pd.Series(np.array([1, 2, 3]), index=idx2) + expected2 = input2.iloc[[0, 2]] + + result2 = idx2.get_value(input2, p1) + tm.assert_series_equal(result2, expected2) + + @pytest.mark.parametrize("freq", ["H", "D"]) + def test_get_value_datetime_hourly(self, freq): + # get_loc and get_value should treat datetime objects symmetrically + dti = date_range("2016-01-01", periods=3, freq="MS") + pi = dti.to_period(freq) + ser = pd.Series(range(7, 10), index=pi) + + ts = dti[0] + + assert pi.get_loc(ts) == 0 + assert pi.get_value(ser, ts) == 7 + assert ser[ts] == 7 + assert ser.loc[ts] == 7 + + ts2 = ts + Timedelta(hours=3) + if freq == "H": + with pytest.raises(KeyError, match="2016-01-01 03:00"): + pi.get_loc(ts2) + with pytest.raises(KeyError, match="2016-01-01 03:00"): + pi.get_value(ser, ts2) + with pytest.raises(KeyError, match="2016-01-01 03:00"): + ser[ts2] + with pytest.raises(KeyError, match="2016-01-01 03:00"): + ser.loc[ts2] + else: + assert pi.get_loc(ts2) == 0 + assert pi.get_value(ser, ts2) == 7 + assert ser[ts2] == 7 + assert ser.loc[ts2] == 7 + + def test_get_value_integer(self): + msg = "index 16801 is out of bounds for axis 0 with size 3" + dti = date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + ser = pd.Series(range(3), index=pi) + with pytest.raises(IndexError, match=msg): + pi.get_value(ser, 16801) + + msg = "index 46 is out of bounds for axis 0 with size 3" + pi2 = dti.to_period("Y") # duplicates, ordinals are all 46 + ser2 = 
pd.Series(range(3), index=pi2) + with pytest.raises(IndexError, match=msg): + pi2.get_value(ser2, 46) + + +class TestContains: + def test_contains(self): + # GH 17717 + p0 = Period("2017-09-01") + p1 = Period("2017-09-02") + p2 = Period("2017-09-03") + p3 = Period("2017-09-04") + + ps0 = [p0, p1, p2] + idx0 = PeriodIndex(ps0) + ser = pd.Series(range(6, 9), index=idx0) + + for p in ps0: + assert p in idx0 + assert str(p) in idx0 + + # GH#31172 + # Higher-resolution period-like are _not_ considered as contained + key = "2017-09-01 00:00:01" + assert key not in idx0 + with pytest.raises(KeyError, match=key): + idx0.get_loc(key) + with pytest.raises(KeyError, match=key): + idx0.get_value(ser, key) + + assert "2017-09" in idx0 + + assert p3 not in idx0 + + +class TestAsOfLocs: + def test_asof_locs_mismatched_type(self): + dti = pd.date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + pi2 = dti.to_period("H") + + mask = np.array([0, 1, 0], dtype=bool) + + msg = "must be DatetimeIndex or PeriodIndex" + with pytest.raises(TypeError, match=msg): + pi.asof_locs(pd.Int64Index(pi.asi8), mask) + + with pytest.raises(TypeError, match=msg): + pi.asof_locs(pd.Float64Index(pi.asi8), mask) + + with pytest.raises(TypeError, match=msg): + # TimedeltaIndex + pi.asof_locs(dti - dti, mask) + + msg = "Input has different freq=H" + with pytest.raises(libperiod.IncompatibleFrequency, match=msg): + pi.asof_locs(pi2, mask) diff --git a/pandas/tests/indexes/period/test_join.py b/pandas/tests/indexes/period/test_join.py index 9e3df0c32d6d5..8a68561dd5819 100644 --- a/pandas/tests/indexes/period/test_join.py +++ b/pandas/tests/indexes/period/test_join.py @@ -39,5 +39,6 @@ def test_join_does_not_recur(self): def test_join_mismatched_freq_raises(self): index = period_range("1/1/2000", "1/20/2000", freq="D") index3 = period_range("1/1/2000", "1/20/2000", freq="2D") - with pytest.raises(IncompatibleFrequency): + msg = r".*Input has different freq=2D from PeriodIndex\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): index.join(index3) diff --git a/pandas/tests/indexes/period/test_monotonic.py b/pandas/tests/indexes/period/test_monotonic.py new file mode 100644 index 0000000000000..e06e7da1773f5 --- /dev/null +++ b/pandas/tests/indexes/period/test_monotonic.py @@ -0,0 +1,39 @@ +from pandas import Period, PeriodIndex + + +def test_is_monotonic_increasing(): + # GH#17717 + p0 = Period("2017-09-01") + p1 = Period("2017-09-02") + p2 = Period("2017-09-03") + + idx_inc0 = PeriodIndex([p0, p1, p2]) + idx_inc1 = PeriodIndex([p0, p1, p1]) + idx_dec0 = PeriodIndex([p2, p1, p0]) + idx_dec1 = PeriodIndex([p2, p1, p1]) + idx = PeriodIndex([p1, p2, p0]) + + assert idx_inc0.is_monotonic_increasing is True + assert idx_inc1.is_monotonic_increasing is True + assert idx_dec0.is_monotonic_increasing is False + assert idx_dec1.is_monotonic_increasing is False + assert idx.is_monotonic_increasing is False + + +def test_is_monotonic_decreasing(): + # GH#17717 + p0 = Period("2017-09-01") + p1 = Period("2017-09-02") + p2 = Period("2017-09-03") + + idx_inc0 = PeriodIndex([p0, p1, p2]) + idx_inc1 = PeriodIndex([p0, p1, p1]) + idx_dec0 = PeriodIndex([p2, p1, p0]) + idx_dec1 = PeriodIndex([p2, p1, p1]) + idx = PeriodIndex([p1, p2, p0]) + + assert idx_inc0.is_monotonic_decreasing is False + assert idx_inc1.is_monotonic_decreasing is False + assert idx_dec0.is_monotonic_decreasing is True + assert idx_dec1.is_monotonic_decreasing is True + assert idx.is_monotonic_decreasing is False diff --git 
a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 196946e696c8d..fc44226f9d72f 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -81,9 +81,10 @@ def test_value_counts_unique(self): tm.assert_index_equal(idx.unique(), exp_idx) - def test_drop_duplicates_metadata(self): + @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"]) + def test_drop_duplicates_metadata(self, freq): # GH 10115 - idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") + idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -93,26 +94,32 @@ def test_drop_duplicates_metadata(self): tm.assert_index_equal(idx, result) assert idx.freq == result.freq - def test_drop_duplicates(self): + @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"]) + @pytest.mark.parametrize( + "keep, expected, index", + [ + ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), + ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10), + ), + ], + ) + def test_drop_duplicates(self, freq, keep, expected, index): # to check Index/Series compat - base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") - idx = base.append(base[:5]) - - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - tm.assert_series_equal(res, Series(base)) - - res = idx.drop_duplicates(keep="last") - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep="last") - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) - - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx") + idx = idx.append(idx[:5]) + + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) + expected = idx[~expected] + + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, expected) + + result = Series(idx).drop_duplicates(keep=keep) + tm.assert_series_equal(result, Series(expected, index=index)) def test_order_compat(self): def _check_freq(index, expected_index): diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 833901ea7ba22..ad9ee7bd2594d 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -40,10 +40,6 @@ def test_slice_with_zero_step_raises(self): with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] - def test_slice_keep_name(self): - idx = period_range("20010101", periods=10, freq="D", name="bob") - assert idx.name == idx[1:].name - def test_pindex_slice_index(self): pi = period_range(start="1/1/10", end="12/31/12", freq="M") s = Series(np.random.rand(len(pi)), index=pi) @@ -59,6 +55,7 @@ def test_range_slice_day(self): didx = pd.date_range(start="2013/01/01", freq="D", periods=400) pidx = period_range(start="2013/01/01", freq="D", periods=400) + msg = "slice indices must be integers or None or have an __index__ method" for idx in [didx, pidx]: # slices against index should 
raise IndexError values = [ @@ -69,7 +66,7 @@ def test_range_slice_day(self): "2013/02/01 09:00", ] for v in values: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): idx[v:] s = Series(np.random.rand(len(idx)), index=idx) @@ -81,13 +78,14 @@ def test_range_slice_day(self): invalid = ["2013/02/01 9H", "2013/02/01 09:00"] for v in invalid: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): idx[v:] def test_range_slice_seconds(self): # GH#6716 didx = pd.date_range(start="2013/01/01 09:00:00", freq="S", periods=4000) pidx = period_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + msg = "slice indices must be integers or None or have an __index__ method" for idx in [didx, pidx]: # slices against index should raise IndexError @@ -99,7 +97,7 @@ def test_range_slice_seconds(self): "2013/02/01 09:00", ] for v in values: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): idx[v:] s = Series(np.random.rand(len(idx)), index=idx) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 40c7ffba46450..0ce10fb8779a1 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -35,7 +35,7 @@ class TestPeriodIndex(DatetimeLike): def indices(self, request): return request.param - def create_index(self): + def create_index(self) -> PeriodIndex: return period_range("20130101", periods=5, freq="D") def test_pickle_compat_construction(self): @@ -67,35 +67,6 @@ def test_repeat_freqstr(self, index, use_numpy): tm.assert_index_equal(result, expected) assert result.freqstr == index.freqstr - def test_fillna_period(self): - # GH 11343 - idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="H") - - exp = PeriodIndex( - ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" - ) - tm.assert_index_equal(idx.fillna(Period("2011-01-01 10:00", freq="H")), exp) - - exp = Index( - [ - Period("2011-01-01 09:00", freq="H"), - "x", - Period("2011-01-01 11:00", freq="H"), - ], - dtype=object, - ) - tm.assert_index_equal(idx.fillna("x"), exp) - - exp = Index( - [ - Period("2011-01-01 09:00", freq="H"), - Period("2011-01-01", freq="D"), - Period("2011-01-01 11:00", freq="H"), - ], - dtype=object, - ) - tm.assert_index_equal(idx.fillna(Period("2011-01-01", freq="D")), exp) - def test_no_millisecond_field(self): msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" with pytest.raises(AttributeError, match=msg): @@ -105,19 +76,12 @@ def test_no_millisecond_field(self): with pytest.raises(AttributeError, match=msg): DatetimeIndex([]).millisecond - def test_hash_error(self): - index = period_range("20010101", periods=10) - msg = f"unhashable type: '{type(index).__name__}'" - with pytest.raises(TypeError, match=msg): - hash(index) - def test_make_time_series(self): index = period_range(freq="A", start="1/1/2001", end="12/1/2009") series = Series(1, index=index) assert isinstance(series, Series) def test_shallow_copy_empty(self): - # GH13067 idx = PeriodIndex([], freq="M") result = idx._shallow_copy() @@ -125,11 +89,16 @@ def test_shallow_copy_empty(self): tm.assert_index_equal(result, expected) - def test_shallow_copy_i8(self): + def test_shallow_copy_disallow_i8(self): # GH-24391 pi = period_range("2018-01-01", periods=3, freq="2D") - result = pi._shallow_copy(pi.asi8) - tm.assert_index_equal(result, pi) + with pytest.raises(AssertionError, match="ndarray"): + pi._shallow_copy(pi.asi8) + + def 
test_shallow_copy_requires_disallow_period_index(self): + pi = period_range("2018-01-01", periods=3, freq="2D") + with pytest.raises(AssertionError, match="PeriodIndex"): + pi._shallow_copy(pi) def test_view_asi8(self): idx = PeriodIndex([], freq="M") @@ -157,7 +126,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) idx = PeriodIndex(["2011-01", NaT], freq="M") @@ -165,7 +134,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) idx = PeriodIndex(["2011-01-01", NaT], freq="D") @@ -173,7 +142,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._ndarray_values, exp) + tm.assert_numpy_array_equal(idx.asi8, exp) def test_period_index_length(self): pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") @@ -677,3 +646,32 @@ def test_is_monotonic_with_nat(): assert not obj.is_monotonic_increasing assert not obj.is_monotonic_decreasing assert obj.is_unique + + +@pytest.mark.parametrize("array", [True, False]) +def test_dunder_array(array): + obj = PeriodIndex(["2000-01-01", "2001-01-01"], freq="D") + if array: + obj = obj._data + + expected = np.array([obj[0], obj[1]], dtype=object) + result = np.array(obj) + tm.assert_numpy_array_equal(result, expected) + + result = np.asarray(obj) + tm.assert_numpy_array_equal(result, expected) + + expected = obj.asi8 + for dtype in ["i8", "int64", np.int64]: + result = np.array(obj, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + result = np.asarray(obj, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + for dtype in ["float64", "int32", "uint64"]: + msg = "argument must be" + with pytest.raises(TypeError, match=msg): + np.array(obj, dtype=dtype) + with pytest.raises(TypeError, match=msg): + np.array(obj, dtype=getattr(np, dtype)) diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py new file mode 100644 index 0000000000000..f5a2583bf2e10 --- /dev/null +++ b/pandas/tests/indexes/period/test_searchsorted.py @@ -0,0 +1,77 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs import IncompatibleFrequency + +from pandas import NaT, Period, PeriodIndex, Series, array +import pandas._testing as tm + + +class TestSearchsorted: + @pytest.mark.parametrize("freq", ["D", "2D"]) + def test_searchsorted(self, freq): + pidx = PeriodIndex( + ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"], + freq=freq, + ) + + p1 = Period("2014-01-01", freq=freq) + assert pidx.searchsorted(p1) == 0 + + p2 = Period("2014-01-04", freq=freq) + assert pidx.searchsorted(p2) == 3 + + assert pidx.searchsorted(NaT) == 0 + + msg = "Input has different freq=H from PeriodArray" + with pytest.raises(IncompatibleFrequency, match=msg): + pidx.searchsorted(Period("2014-01-01", freq="H")) + + msg = "Input has different freq=5D from PeriodArray" + with pytest.raises(IncompatibleFrequency, match=msg): + pidx.searchsorted(Period("2014-01-01", freq="5D")) + + @pytest.mark.parametrize("klass", [list, np.array, array, Series]) + def 
test_searchsorted_different_argument_classes(self, klass): + pidx = PeriodIndex( + ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"], + freq="D", + ) + result = pidx.searchsorted(klass(pidx)) + expected = np.arange(len(pidx), dtype=result.dtype) + tm.assert_numpy_array_equal(result, expected) + + result = pidx._data.searchsorted(klass(pidx)) + tm.assert_numpy_array_equal(result, expected) + + def test_searchsorted_invalid(self): + pidx = PeriodIndex( + ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"], + freq="D", + ) + + other = np.array([0, 1], dtype=np.int64) + + msg = "|".join( + [ + "searchsorted requires compatible dtype or scalar", + "Unexpected type for 'value'", + ] + ) + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(other) + + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(other.astype("timedelta64[ns]")) + + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(np.timedelta64(4)) + + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(np.timedelta64("NaT", "ms")) + + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(np.datetime64(4, "ns")) + + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(np.datetime64("NaT", "ns")) diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 647d56d33f312..71b827d83b836 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -13,7 +13,6 @@ def _permute(obj): class TestPeriodIndex: - @pytest.mark.parametrize("sort", [None, False]) def test_union(self, sort): # union other1 = period_range("1/1/2000", freq="D", periods=5) @@ -134,7 +133,6 @@ def test_union(self, sort): expected = expected.sort_values() tm.assert_index_equal(result_union, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_union_misc(self, sort): index = period_range("1/1/2000", "1/20/2000", freq="D") @@ -150,7 +148,8 @@ def test_union_misc(self, sort): # raise if different frequencies index = period_range("1/1/2000", "1/20/2000", freq="D") index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") - with pytest.raises(IncompatibleFrequency): + msg = r"Input has different freq=W-WED from PeriodIndex\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): index.union(index2, sort=sort) # TODO: belongs elsewhere @@ -165,7 +164,6 @@ def test_union_dataframe_index(self): exp = period_range("1/1/1980", "1/1/2012", freq="M") tm.assert_index_equal(df.index, exp) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, sort): index = period_range("1/1/2000", "1/20/2000", freq="D") @@ -183,14 +181,15 @@ def test_intersection(self, sort): # raise if different frequencies index = period_range("1/1/2000", "1/20/2000", freq="D") index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") - with pytest.raises(IncompatibleFrequency): + msg = r"Input has different freq=W-WED from PeriodIndex\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): index.intersection(index2, sort=sort) index3 = period_range("1/1/2000", "1/20/2000", freq="2D") - with pytest.raises(IncompatibleFrequency): + msg = r"Input has different freq=2D from PeriodIndex\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): index.intersection(index3, sort=sort) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_cases(self, sort): base = period_range("6/1/2000", "6/30/2000", freq="D", name="idx") @@ -259,7 +258,6 @@ def 
test_intersection_cases(self, sort): result = rng.intersection(rng[0:0]) assert len(result) == 0 - @pytest.mark.parametrize("sort", [None, False]) def test_difference(self, sort): # diff period_rng = ["1/3/2000", "1/2/2000", "1/1/2000", "1/5/2000", "1/4/2000"] @@ -324,7 +322,6 @@ def test_difference(self, sort): expected = expected.sort_values() tm.assert_index_equal(result_difference, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_difference_freq(self, sort): # GH14323: difference of Period MUST preserve frequency # but the ability to union results must be preserved diff --git a/pandas/tests/indexes/period/test_shift.py b/pandas/tests/indexes/period/test_shift.py index b4c9810f3a554..278bb7f07c679 100644 --- a/pandas/tests/indexes/period/test_shift.py +++ b/pandas/tests/indexes/period/test_shift.py @@ -63,7 +63,8 @@ def test_shift_corner_cases(self): # GH#9903 idx = PeriodIndex([], name="xxx", freq="H") - with pytest.raises(TypeError): + msg = "`freq` argument is not supported for PeriodArray._time_shift" + with pytest.raises(TypeError, match=msg): # period shift doesn't accept freq idx.shift(1, freq="H") diff --git a/pandas/tests/indexes/period/test_to_timestamp.py b/pandas/tests/indexes/period/test_to_timestamp.py new file mode 100644 index 0000000000000..23787586cb3d3 --- /dev/null +++ b/pandas/tests/indexes/period/test_to_timestamp.py @@ -0,0 +1,101 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + NaT, + PeriodIndex, + Timedelta, + Timestamp, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestToTimestamp: + def test_to_timestamp_freq(self): + idx = period_range("2017", periods=12, freq="A-DEC") + result = idx.to_timestamp() + expected = date_range("2017", periods=12, freq="AS-JAN") + tm.assert_index_equal(result, expected) + + def test_to_timestamp_pi_nat(self): + # GH#7228 + index = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx") + + result = index.to_timestamp("D") + expected = DatetimeIndex( + [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.name == "idx" + + result2 = result.to_period(freq="M") + tm.assert_index_equal(result2, index) + assert result2.name == "idx" + + result3 = result.to_period(freq="3M") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx") + tm.assert_index_equal(result3, exp) + assert result3.freqstr == "3M" + + msg = "Frequency must be positive, because it represents span: -2A" + with pytest.raises(ValueError, match=msg): + result.to_period(freq="-2A") + + def test_to_timestamp_preserve_name(self): + index = period_range(freq="A", start="1/1/2001", end="12/1/2009", name="foo") + assert index.name == "foo" + + conv = index.to_timestamp("D") + assert conv.name == "foo" + + def test_to_timestamp_quarterly_bug(self): + years = np.arange(1960, 2000).repeat(4) + quarters = np.tile(list(range(1, 5)), 40) + + pindex = PeriodIndex(year=years, quarter=quarters) + + stamps = pindex.to_timestamp("D", "end") + expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex]) + tm.assert_index_equal(stamps, expected) + + def test_to_timestamp_pi_mult(self): + idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx") + + result = idx.to_timestamp() + expected = DatetimeIndex(["2011-01-01", "NaT", "2011-02-01"], name="idx") + tm.assert_index_equal(result, expected) + + result = idx.to_timestamp(how="E") + expected = 
DatetimeIndex(["2011-02-28", "NaT", "2011-03-31"], name="idx") + expected = expected + Timedelta(1, "D") - Timedelta(1, "ns") + tm.assert_index_equal(result, expected) + + def test_to_timestamp_pi_combined(self): + idx = period_range(start="2011", periods=2, freq="1D1H", name="idx") + + result = idx.to_timestamp() + expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx") + tm.assert_index_equal(result, expected) + + result = idx.to_timestamp(how="E") + expected = DatetimeIndex( + ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx" + ) + expected = expected + Timedelta(1, "s") - Timedelta(1, "ns") + tm.assert_index_equal(result, expected) + + result = idx.to_timestamp(how="E", freq="H") + expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx") + expected = expected + Timedelta(1, "h") - Timedelta(1, "ns") + tm.assert_index_equal(result, expected) + + def test_to_timestamp_1703(self): + index = period_range("1/1/2012", periods=4, freq="D") + + result = index.to_timestamp() + assert result[0] == Timestamp("1/1/2012") diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index dae220006ebe0..82c13240c6bf2 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -1,20 +1,7 @@ -from datetime import datetime - import numpy as np import pytest -from pandas._libs.tslibs import IncompatibleFrequency - -from pandas import ( - DatetimeIndex, - NaT, - Period, - PeriodIndex, - Timedelta, - Timestamp, - date_range, - period_range, -) +from pandas import Period, PeriodIndex, period_range import pandas._testing as tm @@ -40,63 +27,6 @@ def test_freq(self, freq): self._check_freq(freq, "1970-01-01") -class TestSearchsorted: - @pytest.mark.parametrize("freq", ["D", "2D"]) - def test_searchsorted(self, freq): - pidx = PeriodIndex( - ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"], - freq=freq, - ) - - p1 = Period("2014-01-01", freq=freq) - assert pidx.searchsorted(p1) == 0 - - p2 = Period("2014-01-04", freq=freq) - assert pidx.searchsorted(p2) == 3 - - assert pidx.searchsorted(NaT) == 0 - - msg = "Input has different freq=H from PeriodArray" - with pytest.raises(IncompatibleFrequency, match=msg): - pidx.searchsorted(Period("2014-01-01", freq="H")) - - msg = "Input has different freq=5D from PeriodArray" - with pytest.raises(IncompatibleFrequency, match=msg): - pidx.searchsorted(Period("2014-01-01", freq="5D")) - - def test_searchsorted_invalid(self): - pidx = PeriodIndex( - ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"], - freq="D", - ) - - other = np.array([0, 1], dtype=np.int64) - - msg = "|".join( - [ - "searchsorted requires compatible dtype or scalar", - "Unexpected type for 'value'", - ] - ) - with pytest.raises(TypeError, match=msg): - pidx.searchsorted(other) - - with pytest.raises(TypeError, match=msg): - pidx.searchsorted(other.astype("timedelta64[ns]")) - - with pytest.raises(TypeError, match=msg): - pidx.searchsorted(np.timedelta64(4)) - - with pytest.raises(TypeError, match=msg): - pidx.searchsorted(np.timedelta64("NaT", "ms")) - - with pytest.raises(TypeError, match=msg): - pidx.searchsorted(np.datetime64(4, "ns")) - - with pytest.raises(TypeError, match=msg): - pidx.searchsorted(np.datetime64("NaT", "ns")) - - class TestPeriodIndexConversion: def test_tolist(self): index = period_range(freq="A", start="1/1/2001", end="12/1/2009") @@ -106,89 +36,3 @@ def test_tolist(self): recon = PeriodIndex(rs) 
tm.assert_index_equal(index, recon) - - -class TestToTimestamp: - def test_to_timestamp_freq(self): - idx = period_range("2017", periods=12, freq="A-DEC") - result = idx.to_timestamp() - expected = date_range("2017", periods=12, freq="AS-JAN") - tm.assert_index_equal(result, expected) - - def test_to_timestamp_pi_nat(self): - # GH#7228 - index = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx") - - result = index.to_timestamp("D") - expected = DatetimeIndex( - [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.name == "idx" - - result2 = result.to_period(freq="M") - tm.assert_index_equal(result2, index) - assert result2.name == "idx" - - result3 = result.to_period(freq="3M") - exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx") - tm.assert_index_equal(result3, exp) - assert result3.freqstr == "3M" - - msg = "Frequency must be positive, because it represents span: -2A" - with pytest.raises(ValueError, match=msg): - result.to_period(freq="-2A") - - def test_to_timestamp_preserve_name(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009", name="foo") - assert index.name == "foo" - - conv = index.to_timestamp("D") - assert conv.name == "foo" - - def test_to_timestamp_quarterly_bug(self): - years = np.arange(1960, 2000).repeat(4) - quarters = np.tile(list(range(1, 5)), 40) - - pindex = PeriodIndex(year=years, quarter=quarters) - - stamps = pindex.to_timestamp("D", "end") - expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex]) - tm.assert_index_equal(stamps, expected) - - def test_to_timestamp_pi_mult(self): - idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx") - - result = idx.to_timestamp() - expected = DatetimeIndex(["2011-01-01", "NaT", "2011-02-01"], name="idx") - tm.assert_index_equal(result, expected) - - result = idx.to_timestamp(how="E") - expected = DatetimeIndex(["2011-02-28", "NaT", "2011-03-31"], name="idx") - expected = expected + Timedelta(1, "D") - Timedelta(1, "ns") - tm.assert_index_equal(result, expected) - - def test_to_timestamp_pi_combined(self): - idx = period_range(start="2011", periods=2, freq="1D1H", name="idx") - - result = idx.to_timestamp() - expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx") - tm.assert_index_equal(result, expected) - - result = idx.to_timestamp(how="E") - expected = DatetimeIndex( - ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx" - ) - expected = expected + Timedelta(1, "s") - Timedelta(1, "ns") - tm.assert_index_equal(result, expected) - - result = idx.to_timestamp(how="E", freq="H") - expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx") - expected = expected + Timedelta(1, "h") - Timedelta(1, "ns") - tm.assert_index_equal(result, expected) - - def test_to_timestamp_1703(self): - index = period_range("1/1/2012", periods=4, freq="D") - - result = index.to_timestamp() - assert result[0] == Timestamp("1/1/2012") diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py index ba1de6d551d6b..b7f673428ae38 100644 --- a/pandas/tests/indexes/ranges/test_constructors.py +++ b/pandas/tests/indexes/ranges/test_constructors.py @@ -37,28 +37,36 @@ def test_constructor_invalid_args(self): with pytest.raises(TypeError, match=msg): RangeIndex(name="Foo") - # invalid args - for i in [ + # we don't allow on a bare Index + msg = ( + r"Index\(\.\.\.\) must be called with a collection of 
some " + r"kind, 0 was passed" + ) + with pytest.raises(TypeError, match=msg): + Index(0) + + @pytest.mark.parametrize( + "args", + [ Index(["a", "b"]), Series(["a", "b"]), np.array(["a", "b"]), [], - "foo", - datetime(2000, 1, 1, 0, 0), np.arange(0, 10), np.array([1]), [1], - ]: - with pytest.raises(TypeError): - RangeIndex(i) + ], + ) + def test_constructor_additional_invalid_args(self, args): + msg = f"Value needs to be a scalar value, was type {type(args).__name__}" + with pytest.raises(TypeError, match=msg): + RangeIndex(args) - # we don't allow on a bare Index - msg = ( - r"Index\(\.\.\.\) must be called with a collection of some " - r"kind, 0 was passed" - ) + @pytest.mark.parametrize("args", ["foo", datetime(2000, 1, 1, 0, 0)]) + def test_constructor_invalid_args_wrong_type(self, args): + msg = f"Wrong type {type(args)} for value {args}" with pytest.raises(TypeError, match=msg): - Index(0, 1000) + RangeIndex(args) def test_constructor_same(self): @@ -81,7 +89,7 @@ def test_constructor_same(self): def test_constructor_range(self): - msg = "Value needs to be a scalar value, was type " + msg = "Value needs to be a scalar value, was type range" with pytest.raises(TypeError, match=msg): result = RangeIndex(range(1, 5, 2)) diff --git a/pandas/tests/indexes/ranges/test_indexing.py b/pandas/tests/indexes/ranges/test_indexing.py new file mode 100644 index 0000000000000..238c33c3db6d7 --- /dev/null +++ b/pandas/tests/indexes/ranges/test_indexing.py @@ -0,0 +1,79 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import RangeIndex +import pandas._testing as tm + + +class TestGetIndexer: + def test_get_indexer(self): + index = RangeIndex(start=0, stop=20, step=2) + target = RangeIndex(10) + indexer = index.get_indexer(target) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_pad(self): + index = RangeIndex(start=0, stop=20, step=2) + target = RangeIndex(10) + indexer = index.get_indexer(target, method="pad") + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_backfill(self): + index = RangeIndex(start=0, stop=20, step=2) + target = RangeIndex(10) + indexer = index.get_indexer(target, method="backfill") + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_limit(self): + # GH#28631 + idx = RangeIndex(4) + target = RangeIndex(6) + result = idx.get_indexer(target, method="pad", limit=1) + expected = np.array([0, 1, 2, 3, 3, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("stop", [0, -1, -2]) + def test_get_indexer_decreasing(self, stop): + # GH#28678 + index = RangeIndex(7, stop, -3) + result = index.get_indexer(range(9)) + expected = np.array([-1, 2, -1, -1, 1, -1, -1, 0, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + +class TestTake: + def test_take_preserve_name(self): + index = RangeIndex(1, 5, name="foo") + taken = index.take([3, 0, 1]) + assert index.name == taken.name + + def test_take_fill_value(self): + # GH#12631 + idx = pd.RangeIndex(1, 4, name="xxx") + result = idx.take(np.array([1, 0, -1])) + expected = pd.Int64Index([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + + # fill_value + msg = "Unable to fill values because RangeIndex cannot contain NA" + with pytest.raises(ValueError, match=msg): + 
idx.take(np.array([1, 0, -1]), fill_value=True) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.Int64Index([2, 1, 3], name="xxx") + tm.assert_index_equal(result, expected) + + msg = "Unable to fill values because RangeIndex cannot contain NA" + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with pytest.raises(ValueError, match=msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): + idx.take(np.array([1, -5])) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index c1cc23039eeaf..05422e7b4419f 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -30,7 +30,7 @@ class TestRangeIndex(Numeric): def indices(self, request): return request.param - def create_index(self): + def create_index(self) -> RangeIndex: return RangeIndex(start=0, stop=20, step=2) def test_can_hold_identifiers(self): @@ -257,43 +257,6 @@ def test_identical(self): assert not index.copy(dtype=object).identical(index.copy(dtype="int64")) - def test_get_indexer(self): - index = self.create_index() - target = RangeIndex(10) - indexer = index.get_indexer(target) - expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) - tm.assert_numpy_array_equal(indexer, expected) - - def test_get_indexer_pad(self): - index = self.create_index() - target = RangeIndex(10) - indexer = index.get_indexer(target, method="pad") - expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) - tm.assert_numpy_array_equal(indexer, expected) - - def test_get_indexer_backfill(self): - index = self.create_index() - target = RangeIndex(10) - indexer = index.get_indexer(target, method="backfill") - expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) - tm.assert_numpy_array_equal(indexer, expected) - - def test_get_indexer_limit(self): - # GH 28631 - idx = RangeIndex(4) - target = RangeIndex(6) - result = idx.get_indexer(target, method="pad", limit=1) - expected = np.array([0, 1, 2, 3, 3, -1], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("stop", [0, -1, -2]) - def test_get_indexer_decreasing(self, stop): - # GH 28678 - index = RangeIndex(7, stop, -3) - result = index.get_indexer(range(9)) - expected = np.array([-1, 2, -1, -1, 1, -1, -1, 0, -1], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - def test_nbytes(self): # memory savings vs int index @@ -304,14 +267,19 @@ def test_nbytes(self): i2 = RangeIndex(0, 10) assert i.nbytes == i2.nbytes - def test_cant_or_shouldnt_cast(self): - # can't - with pytest.raises(TypeError): - RangeIndex("foo", "bar", "baz") - - # shouldn't - with pytest.raises(TypeError): - RangeIndex("0", "1", "2") + @pytest.mark.parametrize( + "start,stop,step", + [ + # can't + ("foo", "bar", "baz"), + # shouldn't + ("0", "1", "2"), + ], + ) + def test_cant_or_shouldnt_cast(self, start, stop, step): + msg = f"Wrong type {type(start)} for value {start}" + with pytest.raises(TypeError, match=msg): + RangeIndex(start, stop, step) def test_view_index(self): index = self.create_index() @@ -322,41 +290,6 @@ def test_prevent_casting(self): result = index.astype("O") assert result.dtype == np.object_ - def test_take_preserve_name(self): - index = RangeIndex(1, 5, name="foo") - taken = index.take([3, 0, 1]) - assert index.name == 
taken.name - - def test_take_fill_value(self): - # GH 12631 - idx = pd.RangeIndex(1, 4, name="xxx") - result = idx.take(np.array([1, 0, -1])) - expected = pd.Int64Index([2, 1, 3], name="xxx") - tm.assert_index_equal(result, expected) - - # fill_value - msg = "Unable to fill values because RangeIndex cannot contain NA" - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -1]), fill_value=True) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = pd.Int64Index([2, 1, 3], name="xxx") - tm.assert_index_equal(result, expected) - - msg = "Unable to fill values because RangeIndex cannot contain NA" - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with pytest.raises(IndexError): - idx.take(np.array([1, -5])) - - def test_print_unicode_columns(self): - df = pd.DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) - repr(df.columns) # should not raise UnicodeDecodeError - def test_repr_roundtrip(self): index = self.create_index() tm.assert_index_equal(eval(repr(index)), index) diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py index 8e749e0752087..5b565310cfb9c 100644 --- a/pandas/tests/indexes/ranges/test_setops.py +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -8,7 +8,6 @@ class TestRangeIndexSetOps: - @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, sort): # intersect with Int64Index index = RangeIndex(start=0, stop=20, step=2) @@ -79,7 +78,6 @@ def test_intersection(self, sort): expected = RangeIndex(0, 0, 1) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [False, None]) def test_union_noncomparable(self, sort): # corner case, non-Int64Index index = RangeIndex(start=0, stop=20, step=2) diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 86881b8984228..8cbea846bc870 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -5,6 +5,14 @@ """ import pytest +import pandas._testing as tm + + +def test_boolean_context_compat(indices): + with pytest.raises(ValueError, match="The truth value of a"): + if indices: + pass + def test_sort(indices): msg = "cannot sort an Index object in-place, use sort_values instead" @@ -27,9 +35,58 @@ def test_mutability(indices): def test_wrong_number_names(indices): + names = indices.nlevels * ["apple", "banana", "carrot"] with pytest.raises(ValueError, match="^Length"): - indices.names = ["apple", "banana", "carrot"] + indices.names = names + + +class TestConversion: + def test_to_series(self, indices): + # assert that we are creating a copy of the index + + ser = indices.to_series() + assert ser.values is not indices.values + assert ser.index is not indices + assert ser.name == indices.name + + def test_to_series_with_arguments(self, indices): + # GH#18699 + + # index kwarg + ser = indices.to_series(index=indices) + + assert ser.values is not indices.values + assert ser.index is indices + assert ser.name == indices.name + + # name kwarg + ser = indices.to_series(name="__test") + + assert ser.values is not indices.values + assert ser.index is not indices + assert ser.name != indices.name + + def test_tolist_matches_list(self, indices): + assert indices.tolist() == list(indices) + + +class TestRoundTrips: + def test_pickle_roundtrip(self, 
indices): + result = tm.round_trip_pickle(indices) + tm.assert_index_equal(result, indices) + if result.nlevels > 1: + # GH#8367 round-trip with timezone + assert indices.equal_levels(result) + + +class TestIndexing: + def test_slice_keeps_name(self, indices): + assert indices.name == indices[1:].name -def test_tolist_matches_list(indices): - assert indices.tolist() == list(indices) +class TestRendering: + def test_str(self, indices): + # test the string repr + indices.name = "foo" + assert "'foo'" in str(indices) + assert type(indices).__name__ in str(indices) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 22f6af2af4aed..1083f1c332705 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -33,7 +33,6 @@ period_range, ) import pandas._testing as tm -from pandas.conftest import indices_dict from pandas.core.indexes.api import ( Index, MultiIndex, @@ -47,19 +46,7 @@ class TestIndex(Base): _holder = Index - @pytest.fixture - def index(self, request): - """ - Fixture for selectively parametrizing indices_dict via indirect parametrization - (parametrize over indices_dict keys with indirect=True). Defaults to string - index if no keys are provided. - """ - key = getattr(request, "param", "string") - - # copy to avoid mutation, e.g. setting .name - return indices_dict[key].copy() - - def create_index(self): + def create_index(self) -> Index: return Index(list("abcde")) def test_can_hold_identifiers(self): @@ -67,33 +54,35 @@ def test_can_hold_identifiers(self): key = index[0] assert index._can_hold_identifiers_and_holds_name(key) is True - @pytest.mark.parametrize("index", ["datetime"], indirect=True) - def test_new_axis(self, index): + @pytest.mark.parametrize("indices", ["datetime"], indirect=True) + def test_new_axis(self, indices): with tm.assert_produces_warning(DeprecationWarning): # GH#30588 multi-dimensional indexing deprecated - new_index = index[None, :] + new_index = indices[None, :] assert new_index.ndim == 2 assert isinstance(new_index, np.ndarray) - @pytest.mark.parametrize("index", ["int", "uint", "float"], indirect=True) - def test_copy_and_deepcopy(self, index): - new_copy2 = index.copy(dtype=int) + @pytest.mark.parametrize("indices", ["int", "uint", "float"], indirect=True) + def test_copy_and_deepcopy(self, indices): + new_copy2 = indices.copy(dtype=int) assert new_copy2.dtype.kind == "i" def test_constructor_regular(self, indices): tm.assert_contains_all(indices, indices) - def test_constructor_casting(self, index): + @pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_constructor_casting(self, indices): # casting - arr = np.array(index) + arr = np.array(indices) new_index = Index(arr) tm.assert_contains_all(arr, new_index) - tm.assert_index_equal(index, new_index) + tm.assert_index_equal(indices, new_index) - def test_constructor_copy(self, index): + @pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_constructor_copy(self, indices): # copy # index = self.create_index() - arr = np.array(index) + arr = np.array(indices) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) assert new_index.name == "name" @@ -159,17 +148,6 @@ def test_constructor_from_series_dtlike(self, index, has_tz): if has_tz: assert result.tz == index.tz - @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) - def test_constructor_from_series(self, klass): - expected = DatetimeIndex( - [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")] 
- ) - s = Series( - [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")] - ) - result = klass(s) - tm.assert_index_equal(result, expected) - def test_constructor_from_series_freq(self): # GH 6273 # create from a series, passing a freq @@ -266,47 +244,6 @@ def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): result = Index(np.array(na_list)) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("pos", [0, 1]) - @pytest.mark.parametrize( - "klass,dtype,ctor", - [ - (pd.DatetimeIndex, "datetime64[ns]", np.datetime64("nat")), - (pd.TimedeltaIndex, "timedelta64[ns]", np.timedelta64("nat")), - ], - ) - def test_index_ctor_infer_nat_dt_like(self, pos, klass, dtype, ctor, nulls_fixture): - expected = klass([pd.NaT, pd.NaT]) - assert expected.dtype == dtype - data = [ctor] - data.insert(pos, nulls_fixture) - - if nulls_fixture is pd.NA: - expected = Index([pd.NA, pd.NaT]) - pytest.xfail("Broken with np.NaT ctor; see GH 31884") - - result = Index(data) - tm.assert_index_equal(result, expected) - - result = Index(np.array(data, dtype=object)) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("swap_objs", [True, False]) - def test_index_ctor_nat_result(self, swap_objs): - # mixed np.datetime64/timedelta64 nat results in object - data = [np.datetime64("nat"), np.timedelta64("nat")] - if swap_objs: - data = data[::-1] - - expected = pd.Index(data, dtype=object) - tm.assert_index_equal(Index(data), expected) - tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) - - def test_index_ctor_infer_periodindex(self): - xp = period_range("2012-1-1", freq="M", periods=3) - rs = Index(xp) - tm.assert_index_equal(rs, xp) - assert isinstance(rs, PeriodIndex) - @pytest.mark.parametrize( "vals,dtype", [ @@ -498,7 +435,7 @@ def test_constructor_overflow_int64(self): Index([np.iinfo(np.uint64).max - 1], dtype="int64") @pytest.mark.parametrize( - "index", + "indices", [ "datetime", "float", @@ -512,11 +449,11 @@ def test_constructor_overflow_int64(self): ], indirect=True, ) - def test_view_with_args(self, index): - index.view("i8") + def test_view_with_args(self, indices): + indices.view("i8") @pytest.mark.parametrize( - "index", + "indices", [ "unicode", "string", @@ -526,21 +463,21 @@ def test_view_with_args(self, index): ], indirect=True, ) - def test_view_with_args_object_array_raises(self, index): + def test_view_with_args_object_array_raises(self, indices): msg = "Cannot change data-type for object array" with pytest.raises(TypeError, match=msg): - index.view("i8") + indices.view("i8") - @pytest.mark.parametrize("index", ["int", "range"], indirect=True) - def test_astype(self, index): - casted = index.astype("i8") + @pytest.mark.parametrize("indices", ["int", "range"], indirect=True) + def test_astype(self, indices): + casted = indices.astype("i8") # it works! 
casted.get_loc(5) # pass on name - index.name = "foobar" - casted = index.astype("i8") + indices.name = "foobar" + casted = indices.astype("i8") assert casted.name == "foobar" def test_equals_object(self): @@ -608,17 +545,17 @@ def test_is_(self): ind2 = Index(arr, copy=False) assert not ind1.is_(ind2) - @pytest.mark.parametrize("index", ["datetime"], indirect=True) - def test_asof(self, index): - d = index[0] - assert index.asof(d) == d - assert isna(index.asof(d - timedelta(1))) + @pytest.mark.parametrize("indices", ["datetime"], indirect=True) + def test_asof(self, indices): + d = indices[0] + assert indices.asof(d) == d + assert isna(indices.asof(d - timedelta(1))) - d = index[-1] - assert index.asof(d + timedelta(1)) == d + d = indices[-1] + assert indices.asof(d + timedelta(1)) == d - d = index[0].to_pydatetime() - assert isinstance(index.asof(d), Timestamp) + d = indices[0].to_pydatetime() + assert isinstance(indices.asof(d), Timestamp) def test_asof_datetime_partial(self): index = pd.date_range("2010-01-01", periods=2, freq="m") @@ -640,16 +577,17 @@ def test_nanosecond_index_access(self): expected_ts = np_datetime64_compat("2013-01-01 00:00:00.000000050+0000", "ns") assert first_value == x[Timestamp(expected_ts)] - def test_booleanindex(self, index): - bool_index = np.ones(len(index), dtype=bool) + @pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_booleanindex(self, indices): + bool_index = np.ones(len(indices), dtype=bool) bool_index[5:30:2] = False - sub_index = index[bool_index] + sub_index = indices[bool_index] for i, val in enumerate(sub_index): assert sub_index.get_loc(val) == i - sub_index = index[list(bool_index)] + sub_index = indices[list(bool_index)] for i, val in enumerate(sub_index): assert sub_index.get_loc(val) == i @@ -659,32 +597,32 @@ def test_fancy(self): for i in sl: assert i == sl[sl.get_loc(i)] - @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + @pytest.mark.parametrize("indices", ["string", "int", "float"], indirect=True) @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) - def test_empty_fancy(self, index, dtype): + def test_empty_fancy(self, indices, dtype): empty_arr = np.array([], dtype=dtype) - empty_index = type(index)([]) + empty_index = type(indices)([]) - assert index[[]].identical(empty_index) - assert index[empty_arr].identical(empty_index) + assert indices[[]].identical(empty_index) + assert indices[empty_arr].identical(empty_index) - @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) - def test_empty_fancy_raises(self, index): + @pytest.mark.parametrize("indices", ["string", "int", "float"], indirect=True) + def test_empty_fancy_raises(self, indices): # pd.DatetimeIndex is excluded, because it overrides getitem and should # be tested separately. 
empty_farr = np.array([], dtype=np.float_) - empty_index = type(index)([]) + empty_index = type(indices)([]) - assert index[[]].identical(empty_index) + assert indices[[]].identical(empty_index) # np.ndarray only accepts ndarray of int & bool dtypes, so should Index msg = r"arrays used as indices must be of integer \(or boolean\) type" with pytest.raises(IndexError, match=msg): - index[empty_farr] + indices[empty_farr] - @pytest.mark.parametrize("sort", [None, False]) - def test_intersection(self, index, sort): - first = index[:20] - second = index[:10] + @pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_intersection(self, indices, sort): + first = indices[:20] + second = indices[:10] intersect = first.intersection(second, sort=sort) if sort is None: tm.assert_index_equal(intersect, second.sort_values()) @@ -702,7 +640,6 @@ def test_intersection(self, index, sort): (Index([3, 4, 5, 6, 7]), False), ], ) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_name_preservation(self, index2, keeps_name, sort): index1 = Index([1, 2, 3, 4, 5], name="index") expected = Index([3, 4, 5]) @@ -714,16 +651,16 @@ def test_intersection_name_preservation(self, index2, keeps_name, sort): assert result.name == expected.name tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("indices", ["string"], indirect=True) @pytest.mark.parametrize( "first_name,second_name,expected_name", [("A", "A", "A"), ("A", "B", None), (None, "B", None)], ) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_name_preservation2( - self, index, first_name, second_name, expected_name, sort + self, indices, first_name, second_name, expected_name, sort ): - first = index[5:20] - second = index[:10] + first = indices[5:20] + second = indices[:10] first.name = first_name second.name = second_name intersect = first.intersection(second, sort=sort) @@ -736,7 +673,6 @@ def test_intersection_name_preservation2( (Index([4, 7, 6, 5, 3], name="other"), False), ], ) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_monotonic(self, index2, keeps_name, sort): index1 = Index([5, 3, 2, 4, 1], name="index") expected = Index([5, 3, 4]) @@ -753,7 +689,6 @@ def test_intersection_monotonic(self, index2, keeps_name, sort): "index2,expected_arr", [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])], ) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique index1 = Index(["A", "B", "A", "C"]) @@ -763,7 +698,6 @@ def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort) expected = expected.sort_values() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_intersect_str_dates(self, sort): dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] @@ -780,7 +714,6 @@ def test_intersection_equal_sort_true(self): sorted_ = pd.Index(["a", "b", "c"]) tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) - @pytest.mark.parametrize("sort", [None, False]) def test_chained_union(self, sort): # Chained unions handles names correctly i1 = Index([1, 2], name="i1") @@ -797,11 +730,11 @@ def test_chained_union(self, sort): expected = j1.union(j2, sort=sort).union(j3, sort=sort) tm.assert_index_equal(union, expected) - @pytest.mark.parametrize("sort", [None, False]) - def test_union(self, index, sort): - first = index[5:20] - second = index[:10] - everything = index[:20] + 
@pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_union(self, indices, sort): + first = indices[5:20] + second = indices[:10] + everything = indices[:20] union = first.union(second, sort=sort) if sort is None: @@ -835,12 +768,12 @@ def test_union_sort_special_true(self, slice_): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [np.array, Series, list]) - @pytest.mark.parametrize("sort", [None, False]) - def test_union_from_iterables(self, index, klass, sort): + @pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_union_from_iterables(self, indices, klass, sort): # GH 10149 - first = index[5:20] - second = index[:10] - everything = index[:20] + first = indices[5:20] + second = indices[:10] + everything = indices[:20] case = klass(second.values) result = first.union(case, sort=sort) @@ -848,9 +781,9 @@ def test_union_from_iterables(self, index, klass, sort): tm.assert_index_equal(result, everything.sort_values()) assert tm.equalContents(result, everything) - @pytest.mark.parametrize("sort", [None, False]) - def test_union_identity(self, index, sort): - first = index[5:20] + @pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_union_identity(self, indices, sort): + first = indices[5:20] union = first.union(first, sort=sort) # i.e. identity is not preserved when sort is True @@ -870,7 +803,6 @@ def test_union_identity(self, index, sort): "first_name, second_name, expected_name", [("A", "B", None), (None, "B", None), ("A", None, None)], ) - @pytest.mark.parametrize("sort", [None, False]) def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): @@ -887,7 +819,6 @@ def test_union_name_preservation( expected = Index(vals, name=expected_name) assert tm.equalContents(union, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_union_dt_as_obj(self, sort): # TODO: Replace with fixturesult index = self.create_index() @@ -1021,12 +952,12 @@ def test_append_empty_preserve_name(self, name, expected): result = left.append(right) assert result.name == expected + @pytest.mark.parametrize("indices", ["string"], indirect=True) @pytest.mark.parametrize("second_name,expected", [(None, None), ("name", "name")]) - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_name_preservation(self, index, second_name, expected, sort): - first = index[5:20] - second = index[:10] - answer = index[10:20] + def test_difference_name_preservation(self, indices, second_name, expected, sort): + first = indices[5:20] + second = indices[:10] + answer = indices[10:20] first.name = "name" second.name = second_name @@ -1039,38 +970,37 @@ def test_difference_name_preservation(self, index, second_name, expected, sort): else: assert result.name == expected - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_empty_arg(self, index, sort): - first = index[5:20] - first.name == "name" + @pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_difference_empty_arg(self, indices, sort): + first = indices[5:20] + first.name = "name" result = first.difference([], sort) assert tm.equalContents(result, first) assert result.name == first.name - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_identity(self, index, sort): - first = index[5:20] - first.name == "name" + @pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_difference_identity(self, indices, sort): + first = indices[5:20] + 
first.name = "name" result = first.difference(first, sort) assert len(result) == 0 assert result.name == first.name - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_sort(self, index, sort): - first = index[5:20] - second = index[:10] + @pytest.mark.parametrize("indices", ["string"], indirect=True) + def test_difference_sort(self, indices, sort): + first = indices[5:20] + second = indices[:10] result = first.difference(second, sort) - expected = index[10:20] + expected = indices[10:20] if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference(self, sort): # smoke index1 = Index([5, 2, 3, 4], name="index1") @@ -1118,7 +1048,6 @@ def test_difference_incomparable_true(self, opname): with pytest.raises(TypeError, match="Cannot compare"): op(a) - @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_mi(self, sort): index1 = MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])) index2 = MultiIndex.from_tuples([("foo", 1), ("bar", 3)]) @@ -1136,7 +1065,6 @@ def test_symmetric_difference_mi(self, sort): (Index([0, 1]), Index([np.nan, 2.0, 3.0, 0.0])), ], ) - @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_missing(self, index2, expected, sort): # GH 13514 change: {nan} - {nan} == {} # (GH 6444, sorting of nans, is no longer an issue) @@ -1147,7 +1075,6 @@ def test_symmetric_difference_missing(self, index2, expected, sort): expected = expected.sort_values() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_non_index(self, sort): index1 = Index([1, 2, 3, 4], name="index1") index2 = np.array([2, 3, 4, 5]) @@ -1160,7 +1087,6 @@ def test_symmetric_difference_non_index(self, sort): assert tm.equalContents(result, expected) assert result.name == "new_name" - @pytest.mark.parametrize("sort", [None, False]) def test_difference_type(self, indices, sort): # GH 20040 # If taking difference of a set and itself, it @@ -1171,7 +1097,6 @@ def test_difference_type(self, indices, sort): expected = indices.drop(indices) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_difference(self, indices, sort): # GH 20040 # Test that the intersection of an index with an @@ -1183,8 +1108,14 @@ def test_intersection_difference(self, indices, sort): diff = indices.difference(indices, sort=sort) tm.assert_index_equal(inter, diff) + def test_is_mixed_deprecated(self): + # GH#32922 + index = self.create_index() + with tm.assert_produces_warning(FutureWarning): + index.is_mixed() + @pytest.mark.parametrize( - "index, expected", + "indices, expected", [ ("string", False), ("bool", False), @@ -1193,13 +1124,13 @@ def test_intersection_difference(self, indices, sort): ("datetime", False), ("float", True), ], - indirect=["index"], + indirect=["indices"], ) - def test_is_numeric(self, index, expected): - assert index.is_numeric() is expected + def test_is_numeric(self, indices, expected): + assert indices.is_numeric() is expected @pytest.mark.parametrize( - "index, expected", + "indices, expected", [ ("string", True), ("bool", True), @@ -1208,13 +1139,13 @@ def test_is_numeric(self, index, expected): ("datetime", False), ("float", False), ], - indirect=["index"], + indirect=["indices"], ) - def test_is_object(self, index, expected): - assert index.is_object() is expected + def test_is_object(self, indices, 
expected): + assert indices.is_object() is expected @pytest.mark.parametrize( - "index, expected", + "indices, expected", [ ("string", False), ("bool", False), @@ -1223,10 +1154,10 @@ def test_is_object(self, index, expected): ("datetime", True), ("float", False), ], - indirect=["index"], + indirect=["indices"], ) - def test_is_all_dates(self, index, expected): - assert index.is_all_dates is expected + def test_is_all_dates(self, indices, expected): + assert indices.is_all_dates is expected def test_summary(self, indices): self._check_method_works(Index._summary, indices) @@ -1606,37 +1537,37 @@ def test_slice_locs_negative_step(self, in_slice, expected): expected = pd.Index(list(expected)) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) - def test_drop_by_str_label(self, index): - n = len(index) - drop = index[list(range(5, 10))] - dropped = index.drop(drop) + @pytest.mark.parametrize("indices", ["string", "int", "float"], indirect=True) + def test_drop_by_str_label(self, indices): + n = len(indices) + drop = indices[list(range(5, 10))] + dropped = indices.drop(drop) - expected = index[list(range(5)) + list(range(10, n))] + expected = indices[list(range(5)) + list(range(10, n))] tm.assert_index_equal(dropped, expected) - dropped = index.drop(index[0]) - expected = index[1:] + dropped = indices.drop(indices[0]) + expected = indices[1:] tm.assert_index_equal(dropped, expected) - @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + @pytest.mark.parametrize("indices", ["string", "int", "float"], indirect=True) @pytest.mark.parametrize("keys", [["foo", "bar"], ["1", "bar"]]) - def test_drop_by_str_label_raises_missing_keys(self, index, keys): + def test_drop_by_str_label_raises_missing_keys(self, indices, keys): with pytest.raises(KeyError, match=""): - index.drop(keys) + indices.drop(keys) - @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) - def test_drop_by_str_label_errors_ignore(self, index): - n = len(index) - drop = index[list(range(5, 10))] + @pytest.mark.parametrize("indices", ["string", "int", "float"], indirect=True) + def test_drop_by_str_label_errors_ignore(self, indices): + n = len(indices) + drop = indices[list(range(5, 10))] mixed = drop.tolist() + ["foo"] - dropped = index.drop(mixed, errors="ignore") + dropped = indices.drop(mixed, errors="ignore") - expected = index[list(range(5)) + list(range(10, n))] + expected = indices[list(range(5)) + list(range(10, n))] tm.assert_index_equal(dropped, expected) - dropped = index.drop(["foo", "bar"], errors="ignore") - expected = index[list(range(n))] + dropped = indices.drop(["foo", "bar"], errors="ignore") + expected = indices[list(range(n))] tm.assert_index_equal(dropped, expected) def test_drop_by_numeric_label_loc(self): @@ -1756,18 +1687,18 @@ def test_set_value_deprecated(self): assert arr[1] == 80 @pytest.mark.parametrize( - "index", ["string", "int", "datetime", "timedelta"], indirect=True + "indices", ["string", "int", "datetime", "timedelta"], indirect=True ) - def test_get_value(self, index): + def test_get_value(self, indices): # TODO: Remove function? 
GH 19728 values = np.random.randn(100) - value = index[67] + value = indices[67] with pytest.raises(AttributeError, match="has no attribute '_values'"): # Index.get_value requires a Series, not an ndarray - index.get_value(values, value) + indices.get_value(values, value) - result = index.get_value(Series(values, index=values), value) + result = indices.get_value(Series(values, index=values), value) tm.assert_almost_equal(result, values[67]) @pytest.mark.parametrize("values", [["foo", "bar", "quux"], {"foo", "bar", "quux"}]) @@ -1845,17 +1776,17 @@ def test_isin_level_kwarg(self, level, index): index.name = "foobar" tm.assert_numpy_array_equal(expected, index.isin(values, level="foobar")) - @pytest.mark.parametrize("level", [2, 10, -3]) - def test_isin_level_kwarg_bad_level_raises(self, level, indices): + def test_isin_level_kwarg_bad_level_raises(self, indices): index = indices - with pytest.raises(IndexError, match="Too many levels"): - index.isin([], level=level) + for level in [10, index.nlevels, -(index.nlevels + 1)]: + with pytest.raises(IndexError, match="Too many levels"): + index.isin([], level=level) @pytest.mark.parametrize("label", [1.0, "foobar", "xyzzy", np.nan]) def test_isin_level_kwarg_bad_label_raises(self, label, indices): index = indices if isinstance(index, MultiIndex): - index = index.rename(["foo", "bar"]) + index = index.rename(["foo", "bar"] + index.names[2:]) msg = f"'Level {label} not found'" else: index = index.rename("foo") @@ -1889,9 +1820,10 @@ def test_boolean_cmp(self, values): tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("indices", ["string"], indirect=True) @pytest.mark.parametrize("name,level", [(None, 0), ("a", "a")]) - def test_get_level_values(self, index, name, level): - expected = index.copy() + def test_get_level_values(self, indices, name, level): + expected = indices.copy() if name: expected.name = name @@ -1903,13 +1835,13 @@ def test_slice_keep_name(self): assert index.name == index[1:].name @pytest.mark.parametrize( - "index", + "indices", ["unicode", "string", "datetime", "int", "uint", "float"], indirect=True, ) - def test_join_self(self, index, join_type): - joined = index.join(index, how=join_type) - assert index is joined + def test_join_self(self, indices, join_type): + joined = indices.join(indices, how=join_type) + assert indices is joined @pytest.mark.parametrize("method", ["strip", "rstrip", "lstrip"]) def test_str_attribute(self, method): @@ -2285,7 +2217,8 @@ def test_contains_method_removed(self, indices): if isinstance(indices, pd.IntervalIndex): indices.contains(1) else: - with pytest.raises(AttributeError): + msg = f"'{type(indices).__name__}' object has no attribute 'contains'" + with pytest.raises(AttributeError, match=msg): indices.contains(1) @@ -2299,7 +2232,7 @@ class TestMixedIntIndex(Base): def indices(self, request): return Index(request.param) - def create_index(self): + def create_index(self) -> Index: return Index([0, "a", 1, "b", 2, "c"]) def test_argsort(self): @@ -2459,10 +2392,6 @@ def test_int_name_format(self, klass): result = klass(list(range(3)), index=index) assert "0" in repr(result) - def test_print_unicode_columns(self): - df = pd.DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) - repr(df.columns) # should not raise UnicodeDecodeError - def test_str_to_bytes_raises(self): # GH 26447 index = Index([str(x) for x in range(10)]) @@ -2480,6 +2409,17 @@ def test_intersect_str_dates(self): expected = Index([], dtype=object) tm.assert_index_equal(result, expected) 
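# Aside on the pytest.raises(..., match=...) pattern used throughout the hunks above
# (a hedged illustration, not part of the upstream patch; the demo function and its
# message below are hypothetical). The match argument is applied as a regular
# expression via re.search against the text of the raised exception, which is why the
# expected messages in this patch are written as raw strings with escaped parentheses
# and dots, e.g. r"Index\(\.\.\.\) must be called with a collection of some kind".

import re

import pytest


def demo():
    # hypothetical helper that raises with a message containing regex metacharacters
    raise ValueError("Index(...) must be called with a collection of some kind")


def test_demo_message_is_checked():
    # literal parentheses and dots must be escaped because match is treated as a regex
    msg = r"Index\(\.\.\.\) must be called with a collection"
    with pytest.raises(ValueError, match=msg):
        demo()
    # roughly the check pytest performs internally on the exception text
    assert re.search(msg, "Index(...) must be called with a collection of some kind")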
+ def test_index_repr_bool_nan(self): + # GH32146 + arr = Index([True, False, np.nan], dtype=object) + exp1 = arr.format() + out1 = ["True", "False", "NaN"] + assert out1 == exp1 + + exp2 = repr(arr) + out2 = "Index([True, False, nan], dtype='object')" + assert out2 == exp2 + class TestIndexUtils: @pytest.mark.parametrize( @@ -2620,9 +2560,47 @@ def test_convert_almost_null_slice(indices): key = slice(None, None, "foo") if isinstance(idx, pd.IntervalIndex): - with pytest.raises(ValueError, match="cannot support not-default step"): + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + with pytest.raises(ValueError, match=msg): idx._convert_slice_indexer(key, "loc") else: msg = "'>=' not supported between instances of 'str' and 'int'" with pytest.raises(TypeError, match=msg): idx._convert_slice_indexer(key, "loc") + + +dtlike_dtypes = [ + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + pd.DatetimeTZDtype("ns", "Asia/Tokyo"), + pd.PeriodDtype("ns"), +] + + +@pytest.mark.parametrize("ldtype", dtlike_dtypes) +@pytest.mark.parametrize("rdtype", dtlike_dtypes) +def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): + + vals = np.tile(3600 * 10 ** 9 * np.arange(3), 2) + + def construct(dtype): + if dtype is dtlike_dtypes[-1]: + # PeriodArray will try to cast ints to strings + return pd.DatetimeIndex(vals).astype(dtype) + return pd.Index(vals, dtype=dtype) + + left = construct(ldtype) + right = construct(rdtype) + + result = left.get_indexer_non_unique(right) + + if ldtype is rdtype: + ex1 = np.array([0, 3, 1, 4, 2, 5] * 2, dtype=np.intp) + ex2 = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result[0], ex1) + tm.assert_numpy_array_equal(result[1], ex2.astype(np.int64)) + + else: + no_matches = np.array([-1] * 6, dtype=np.intp) + tm.assert_numpy_array_equal(result[0], no_matches) + tm.assert_numpy_array_equal(result[1], no_matches) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index b46e6514b4536..01d72670f37aa 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -10,7 +10,7 @@ from pandas._libs.tslibs import iNaT -from pandas.core.dtypes.common import needs_i8_conversion +from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd from pandas import CategoricalIndex, MultiIndex, RangeIndex @@ -125,10 +125,6 @@ def test_to_flat_index(self, indices): result = indices.to_flat_index() tm.assert_index_equal(result, indices) - def test_wrong_number_names(self, indices): - with pytest.raises(ValueError, match="^Length"): - indices.names = ["apple", "banana", "carrot"] - def test_set_name_methods(self, indices): new_name = "This is the new name for this index" @@ -219,7 +215,10 @@ def test_get_unique_index(self, indices): if not indices._can_hold_na: pytest.skip("Skip na-check if index cannot hold na") - if needs_i8_conversion(indices): + if is_period_dtype(indices): + vals = indices[[0] * 5]._data + vals[0] = pd.NaT + elif needs_i8_conversion(indices): vals = indices.asi8[[0] * 5] vals[0] = iNaT else: @@ -299,32 +298,65 @@ def test_pickle(self, indices): assert indices.equals(unpickled) indices.name = original_name - @pytest.mark.parametrize("keep", ["first", "last", False]) - def test_duplicated(self, indices, keep): - if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)): - # MultiIndex tested separately in: - # tests/indexes/multi/test_unique_and_duplicates - pytest.skip("Skip check for empty Index, MultiIndex, 
RangeIndex") - + def test_drop_duplicates(self, indices, keep): + if isinstance(indices, MultiIndex): + pytest.skip("MultiIndex is tested separately") + if isinstance(indices, RangeIndex): + pytest.skip( + "RangeIndex is tested in test_drop_duplicates_no_duplicates " + "as it cannot hold duplicates" + ) + if len(indices) == 0: + pytest.skip( + "empty index is tested in test_drop_duplicates_no_duplicates " + "as it cannot hold duplicates" + ) + + # make unique index holder = type(indices) + unique_values = list(set(indices)) + unique_idx = holder(unique_values) - idx = holder(indices) - if idx.has_duplicates: - # We are testing the duplicated-method here, so we need to know - # exactly which indices are duplicate and how (for the result). - # This is not possible if "idx" has duplicates already, which we - # therefore remove. This is seemingly circular, as drop_duplicates - # invokes duplicated, but in the end, it all works out because we - # cross-check with Series.duplicated, which is tested separately. - idx = idx.drop_duplicates() + # make duplicated index + n = len(unique_idx) + duplicated_selection = np.random.choice(n, int(n * 1.5)) + idx = holder(unique_idx.values[duplicated_selection]) - n, k = len(idx), 10 - duplicated_selection = np.random.choice(n, k * n) - expected = pd.Series(duplicated_selection).duplicated(keep=keep).values - idx = holder(idx.values[duplicated_selection]) + # Series.duplicated is tested separately + expected_duplicated = ( + pd.Series(duplicated_selection).duplicated(keep=keep).values + ) + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected_duplicated) - result = idx.duplicated(keep=keep) - tm.assert_numpy_array_equal(result, expected) + # Series.drop_duplicates is tested separately + expected_dropped = holder(pd.Series(idx).drop_duplicates(keep=keep)) + tm.assert_index_equal(idx.drop_duplicates(keep=keep), expected_dropped) + + def test_drop_duplicates_no_duplicates(self, indices): + if isinstance(indices, MultiIndex): + pytest.skip("MultiIndex is tested separately") + + # make unique index + if isinstance(indices, RangeIndex): + # RangeIndex cannot have duplicates + unique_idx = indices + else: + holder = type(indices) + unique_values = list(set(indices)) + unique_idx = holder(unique_values) + + # check on unique index + expected_duplicated = np.array([False] * len(unique_idx), dtype="bool") + tm.assert_numpy_array_equal(unique_idx.duplicated(), expected_duplicated) + result_dropped = unique_idx.drop_duplicates() + tm.assert_index_equal(result_dropped, unique_idx) + # validate shallow copy + assert result_dropped is not unique_idx + + def test_drop_duplicates_inplace(self, indices): + msg = r"drop_duplicates\(\) got an unexpected keyword argument" + with pytest.raises(TypeError, match=msg): + indices.drop_duplicates(inplace=True) def test_has_duplicates(self, indices): holder = type(indices) @@ -337,3 +369,29 @@ def test_has_duplicates(self, indices): idx = holder([indices[0]] * 5) assert idx.is_unique is False assert idx.has_duplicates is True + + @pytest.mark.parametrize( + "dtype", + ["int64", "uint64", "float64", "category", "datetime64[ns]", "timedelta64[ns]"], + ) + @pytest.mark.parametrize("copy", [True, False]) + def test_astype_preserves_name(self, indices, dtype, copy): + # https://github.com/pandas-dev/pandas/issues/32013 + if isinstance(indices, MultiIndex): + indices.names = ["idx" + str(i) for i in range(indices.nlevels)] + else: + indices.name = "idx" + + try: + # Some of these conversions cannot succeed so we 
use a try / except + if copy: + result = indices.copy(dtype=dtype) + else: + result = indices.astype(dtype) + except (ValueError, TypeError, NotImplementedError, SystemError): + return + + if isinstance(indices, MultiIndex): + assert result.names == indices.names + else: + assert result.name == indices.name diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py index 2e53e29c3fab1..cde3fc00eaaaa 100644 --- a/pandas/tests/indexes/test_frozen.py +++ b/pandas/tests/indexes/test_frozen.py @@ -17,7 +17,8 @@ def check_mutable_error(self, *args, **kwargs): # Pass whatever function you normally would to pytest.raises # (after the Exception kind). mutable_regex = re.compile("does not support mutable operations") - with pytest.raises(TypeError): + msg = "'(_s)?re.(SRE_)?Pattern' object is not callable" + with pytest.raises(TypeError, match=msg): mutable_regex(*args, **kwargs) def test_no_mutable_funcs(self): diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 33f61de6a4ebf..87df5959e6221 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -6,7 +6,21 @@ from pandas.core.dtypes.common import is_unsigned_integer_dtype -from pandas import CategoricalIndex, Index, Int64Index, MultiIndex, UInt64Index +from pandas import ( + NA, + CategoricalIndex, + DatetimeIndex, + Index, + Int64Index, + MultiIndex, + NaT, + PeriodIndex, + Series, + TimedeltaIndex, + Timestamp, + UInt64Index, + period_range, +) import pandas._testing as tm @@ -53,3 +67,58 @@ def test_constructor_categorical_to_object(self): ci = CategoricalIndex(range(5)) result = Index(ci, dtype=object) assert not isinstance(result, CategoricalIndex) + + def test_constructor_infer_periodindex(self): + xp = period_range("2012-1-1", freq="M", periods=3) + rs = Index(xp) + tm.assert_index_equal(rs, xp) + assert isinstance(rs, PeriodIndex) + + @pytest.mark.parametrize("pos", [0, 1]) + @pytest.mark.parametrize( + "klass,dtype,ctor", + [ + (DatetimeIndex, "datetime64[ns]", np.datetime64("nat")), + (TimedeltaIndex, "timedelta64[ns]", np.timedelta64("nat")), + ], + ) + def test_constructor_infer_nat_dt_like( + self, pos, klass, dtype, ctor, nulls_fixture + ): + expected = klass([NaT, NaT]) + assert expected.dtype == dtype + data = [ctor] + data.insert(pos, nulls_fixture) + + if nulls_fixture is NA: + expected = Index([NA, NaT]) + pytest.xfail("Broken with np.NaT ctor; see GH 31884") + + result = Index(data) + tm.assert_index_equal(result, expected) + + result = Index(np.array(data, dtype=object)) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("swap_objs", [True, False]) + def test_constructor_mixed_nat_objs_infers_object(self, swap_objs): + # mixed np.datetime64/timedelta64 nat results in object + data = [np.datetime64("nat"), np.timedelta64("nat")] + if swap_objs: + data = data[::-1] + + expected = Index(data, dtype=object) + tm.assert_index_equal(Index(data), expected) + tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) + + +class TestIndexConstructorUnwrapping: + # Test passing different arraylike values to pd.Index + + @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) + def test_constructor_from_series_dt64(self, klass): + stamps = [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")] + expected = DatetimeIndex(stamps) + ser = Series(stamps) + result = klass(ser) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_indexing.py 
b/pandas/tests/indexes/test_indexing.py new file mode 100644 index 0000000000000..a79bde9fd04e1 --- /dev/null +++ b/pandas/tests/indexes/test_indexing.py @@ -0,0 +1,84 @@ +""" +test_indexing tests the following Index methods: + __getitem__ + get_loc + get_value + __contains__ + take + where + get_indexer + slice_locs + asof_locs + +The corresponding tests.indexes.[index_type].test_indexing files +contain tests for the corresponding methods specific to those Index subclasses. +""" +import numpy as np +import pytest + +from pandas import Float64Index, Index, Int64Index, UInt64Index + + +class TestContains: + @pytest.mark.parametrize( + "index,val", + [ + (Index([0, 1, 2]), 2), + (Index([0, 1, "2"]), "2"), + (Index([0, 1, 2, np.inf, 4]), 4), + (Index([0, 1, 2, np.nan, 4]), 4), + (Index([0, 1, 2, np.inf]), np.inf), + (Index([0, 1, 2, np.nan]), np.nan), + ], + ) + def test_index_contains(self, index, val): + assert val in index + + @pytest.mark.parametrize( + "index,val", + [ + (Index([0, 1, 2]), "2"), + (Index([0, 1, "2"]), 2), + (Index([0, 1, 2, np.inf]), 4), + (Index([0, 1, 2, np.nan]), 4), + (Index([0, 1, 2, np.inf]), np.nan), + (Index([0, 1, 2, np.nan]), np.inf), + # Checking if np.inf in Int64Index should not cause an OverflowError + # Related to GH 16957 + (Int64Index([0, 1, 2]), np.inf), + (Int64Index([0, 1, 2]), np.nan), + (UInt64Index([0, 1, 2]), np.inf), + (UInt64Index([0, 1, 2]), np.nan), + ], + ) + def test_index_not_contains(self, index, val): + assert val not in index + + @pytest.mark.parametrize( + "index,val", [(Index([0, 1, "2"]), 0), (Index([0, 1, "2"]), "2")] + ) + def test_mixed_index_contains(self, index, val): + # GH#19860 + assert val in index + + @pytest.mark.parametrize( + "index,val", [(Index([0, 1, "2"]), "1"), (Index([0, 1, "2"]), 2)] + ) + def test_mixed_index_not_contains(self, index, val): + # GH#19860 + assert val not in index + + def test_contains_with_float_index(self): + # GH#22085 + integer_index = Int64Index([0, 1, 2, 3]) + uinteger_index = UInt64Index([0, 1, 2, 3]) + float_index = Float64Index([0.1, 1.1, 2.2, 3.3]) + + for index in (integer_index, uinteger_index): + assert 1.1 not in index + assert 1.0 in index + assert 1 in index + + assert 1.1 in float_index + assert 1.0 not in float_index + assert 1 not in float_index diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 10d57d8616cf3..35a1cbd141c89 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -14,6 +14,10 @@ class Numeric(Base): + def test_where(self): + # Tested in numeric.test_indexing + pass + def test_can_hold_identifiers(self): idx = self.create_index() key = idx[0] @@ -75,18 +79,6 @@ def test_index_groupby(self): expected = {ex_keys[0]: idx[[0, 5]], ex_keys[1]: idx[[1, 4]]} tm.assert_dict_equal(idx.groupby(to_groupby), expected) - @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) - def test_where(self, klass): - i = self.create_index() - cond = [True] * len(i) - expected = i - result = i.where(klass(cond)) - - cond = [False] + [True] * (len(i) - 1) - expected = Float64Index([i._na_value] + i[1:].tolist()) - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_insert(self, nulls_fixture): # GH 18295 (test missing) index = self.create_index() @@ -118,7 +110,7 @@ def mixed_index(self): def float_index(self): return Float64Index([0.0, 2.5, 5.0, 7.5, 10.0]) - def create_index(self): + def create_index(self) -> Float64Index: return Float64Index(np.arange(5, 
dtype="float64")) def test_repr_roundtrip(self, indices): @@ -310,89 +302,6 @@ def test_equals_numeric(self): i2 = Float64Index([1.0, np.nan]) assert i.equals(i2) - def test_get_indexer(self): - idx = Float64Index([0.0, 1.0, 2.0]) - tm.assert_numpy_array_equal( - idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) - ) - - target = [-0.1, 0.5, 1.1] - tm.assert_numpy_array_equal( - idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) - ) - tm.assert_numpy_array_equal( - idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) - ) - tm.assert_numpy_array_equal( - idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) - ) - - def test_get_loc(self): - idx = Float64Index([0.0, 1.0, 2.0]) - for method in [None, "pad", "backfill", "nearest"]: - assert idx.get_loc(1, method) == 1 - if method is not None: - assert idx.get_loc(1, method, tolerance=0) == 1 - - for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: - assert idx.get_loc(1.1, method) == loc - assert idx.get_loc(1.1, method, tolerance=0.9) == loc - - with pytest.raises(KeyError, match="^'foo'$"): - idx.get_loc("foo") - with pytest.raises(KeyError, match=r"^1\.5$"): - idx.get_loc(1.5) - with pytest.raises(KeyError, match=r"^1\.5$"): - idx.get_loc(1.5, method="pad", tolerance=0.1) - with pytest.raises(KeyError, match="^True$"): - idx.get_loc(True) - with pytest.raises(KeyError, match="^False$"): - idx.get_loc(False) - - with pytest.raises(ValueError, match="must be numeric"): - idx.get_loc(1.4, method="nearest", tolerance="foo") - - with pytest.raises(ValueError, match="must contain numeric elements"): - idx.get_loc(1.4, method="nearest", tolerance=np.array(["foo"])) - - with pytest.raises( - ValueError, match="tolerance size must match target index size" - ): - idx.get_loc(1.4, method="nearest", tolerance=np.array([1, 2])) - - def test_get_loc_na(self): - idx = Float64Index([np.nan, 1, 2]) - assert idx.get_loc(1) == 1 - assert idx.get_loc(np.nan) == 0 - - idx = Float64Index([np.nan, 1, np.nan]) - assert idx.get_loc(1) == 1 - - # representable by slice [0:2:2] - # pytest.raises(KeyError, idx.slice_locs, np.nan) - sliced = idx.slice_locs(np.nan) - assert isinstance(sliced, tuple) - assert sliced == (0, 3) - - # not representable by slice - idx = Float64Index([np.nan, 1, np.nan, np.nan]) - assert idx.get_loc(1) == 1 - msg = "'Cannot get left slice bound for non-unique label: nan" - with pytest.raises(KeyError, match=msg): - idx.slice_locs(np.nan) - - def test_get_loc_missing_nan(self): - # GH 8569 - idx = Float64Index([1, 2]) - assert idx.get_loc(1) == 0 - with pytest.raises(KeyError, match=r"^3$"): - idx.get_loc(3) - with pytest.raises(KeyError, match="^nan$"): - idx.get_loc(np.nan) - with pytest.raises(TypeError, match=r"'\[nan\]' is an invalid key"): - # listlike/non-hashable raises TypeError - idx.get_loc([np.nan]) - @pytest.mark.parametrize( "vals", [ @@ -435,14 +344,6 @@ def test_lookups_datetimelike_values(self, vals): result = ser.iat[1] assert isinstance(result, type(expected)) and result == expected - def test_contains_nans(self): - i = Float64Index([1.0, 2.0, np.nan]) - assert np.nan in i - - def test_contains_not_nans(self): - i = Float64Index([1.0, 2.0, np.nan]) - assert 1.0 in i - def test_doesnt_contain_all_the_things(self): i = Float64Index([np.nan]) assert not i.isin([0]).item() @@ -480,35 +381,6 @@ def test_fillna_float64(self): exp = Index([1.0, "obj", 3.0], name="x") tm.assert_index_equal(idx.fillna("obj"), exp) - def test_take_fill_value(self): - # GH 12631 - 
idx = pd.Float64Index([1.0, 2.0, 3.0], name="xxx") - result = idx.take(np.array([1, 0, -1])) - expected = pd.Float64Index([2.0, 1.0, 3.0], name="xxx") - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.Float64Index([2.0, 1.0, np.nan], name="xxx") - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = pd.Float64Index([2.0, 1.0, 3.0], name="xxx") - tm.assert_index_equal(result, expected) - - msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" - ) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with pytest.raises(IndexError): - idx.take(np.array([1, -5])) - class NumericInt(Numeric): def test_view(self): @@ -616,42 +488,6 @@ def test_prevent_casting(self): result = index.astype("O") assert result.dtype == np.object_ - def test_take_preserve_name(self): - index = self._holder([1, 2, 3, 4], name="foo") - taken = index.take([3, 0, 1]) - assert index.name == taken.name - - def test_take_fill_value(self): - # see gh-12631 - idx = self._holder([1, 2, 3], name="xxx") - result = idx.take(np.array([1, 0, -1])) - expected = self._holder([2, 1, 3], name="xxx") - tm.assert_index_equal(result, expected) - - name = self._holder.__name__ - msg = f"Unable to fill values because {name} cannot contain NA" - - # fill_value=True - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -1]), fill_value=True) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) - expected = self._holder([2, 1, 3], name="xxx") - tm.assert_index_equal(result, expected) - - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with pytest.raises(ValueError, match=msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with pytest.raises(IndexError): - idx.take(np.array([1, -5])) - - def test_slice_keep_name(self): - idx = self._holder([1, 2], name="asdf") - assert idx.name == idx[1:].name - class TestInt64Index(NumericInt): _dtype = "int64" @@ -663,7 +499,7 @@ class TestInt64Index(NumericInt): def indices(self, request): return Int64Index(request.param) - def create_index(self): + def create_index(self) -> Int64Index: # return Int64Index(np.arange(5, dtype="int64")) return Int64Index(range(0, 20, 2)) @@ -743,29 +579,6 @@ def test_coerce_list(self): arr = Index([1, 2, 3, 4], dtype=object) assert isinstance(arr, Index) - def test_get_indexer(self): - index = self.create_index() - target = Int64Index(np.arange(10)) - indexer = index.get_indexer(target) - expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) - tm.assert_numpy_array_equal(indexer, expected) - - target = Int64Index(np.arange(10)) - indexer = index.get_indexer(target, method="pad") - expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) - tm.assert_numpy_array_equal(indexer, expected) - - target = Int64Index(np.arange(10)) - indexer = index.get_indexer(target, method="backfill") - expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) - tm.assert_numpy_array_equal(indexer, expected) - - def test_get_indexer_nan(self): - # GH 7820 - result = Index([1, 2, np.nan]).get_indexer([np.nan]) - expected = np.array([2], dtype=np.intp) - tm.assert_numpy_array_equal(result, 
expected) - def test_intersection(self): index = self.create_index() other = Index([1, 2, 3, 4, 5]) @@ -801,7 +614,7 @@ def index_large(self): large = [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25] return UInt64Index(large) - def create_index(self): + def create_index(self) -> UInt64Index: # compat with shared Int64/Float64 tests; use index_large for UInt64 only tests return UInt64Index(np.arange(5, dtype="uint64")) @@ -827,22 +640,6 @@ def test_constructor(self): res = Index([1, 2 ** 63 + 1], dtype=np.uint64) tm.assert_index_equal(res, idx) - def test_get_indexer(self, index_large): - target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) - indexer = index_large.get_indexer(target) - expected = np.array([0, -1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) - tm.assert_numpy_array_equal(indexer, expected) - - target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) - indexer = index_large.get_indexer(target, method="pad") - expected = np.array([0, 0, 1, 2, 3, 4, 4, 4, 4, 4], dtype=np.intp) - tm.assert_numpy_array_equal(indexer, expected) - - target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) - indexer = index_large.get_indexer(target, method="backfill") - expected = np.array([0, 1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) - tm.assert_numpy_array_equal(indexer, expected) - def test_intersection(self, index_large): other = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20]) result = index_large.intersection(other) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index d0cbb2ab75f72..818d5474eddf5 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -2,8 +2,6 @@ The tests in this package are to ensure the proper resultant dtypes of set operations. """ -import itertools as it - import numpy as np import pytest @@ -13,7 +11,6 @@ from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index import pandas._testing as tm from pandas.api.types import pandas_dtype -from pandas.conftest import indices_dict COMPATIBLE_INCONSISTENT_PAIRS = { (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex), @@ -23,14 +20,6 @@ } -@pytest.fixture(params=it.combinations(indices_dict, 2), ids="-".join) -def index_pair(request): - """ - Create all combinations of 2 index types. 
- """ - return indices_dict[request.param[0]], indices_dict[request.param[1]] - - def test_union_same_types(indices): # Union with a non-unique, non-monotonic index raises error # Only needed for bool index factory @@ -39,14 +28,15 @@ def test_union_same_types(indices): assert idx1.union(idx2).dtype == idx1.dtype -def test_union_different_types(index_pair): +def test_union_different_types(indices, index_fixture2): + # This test only considers combinations of indices # GH 23525 - idx1, idx2 = index_pair + idx1, idx2 = indices, index_fixture2 type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) if type_pair in COMPATIBLE_INCONSISTENT_PAIRS: pytest.xfail("This test only considers non compatible indexes.") - if any(isinstance(idx, pd.MultiIndex) for idx in index_pair): + if any(isinstance(idx, pd.MultiIndex) for idx in (idx1, idx2)): pytest.xfail("This test doesn't consider multiindixes.") if is_dtype_equal(idx1.dtype, idx2.dtype): diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index 82c9d995c9c7c..d9f24b4a35520 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -47,20 +47,22 @@ def test_astype_object_with_nat(self): def test_astype(self): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN]) + idx = TimedeltaIndex([1e14, "NaT", NaT, np.NaN], name="idx") result = idx.astype(object) - expected = Index([Timedelta("1 days 03:46:40")] + [NaT] * 3, dtype=object) + expected = Index( + [Timedelta("1 days 03:46:40")] + [NaT] * 3, dtype=object, name="idx" + ) tm.assert_index_equal(result, expected) result = idx.astype(int) expected = Int64Index( - [100000000000000] + [-9223372036854775808] * 3, dtype=np.int64 + [100000000000000] + [-9223372036854775808] * 3, dtype=np.int64, name="idx" ) tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index(str(x) for x in idx) + expected = Index([str(x) for x in idx], name="idx") tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index 8e54561df1624..3e5bb56c3e58e 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -168,7 +168,11 @@ def test_constructor_coverage(self): with pytest.raises(TypeError, match=msg): timedelta_range(start="1 days", periods="foo", freq="D") - with pytest.raises(TypeError): + msg = ( + r"TimedeltaIndex\(\) must be called with a collection of some kind, " + "'1 days' was passed" + ) + with pytest.raises(TypeError, match=msg): TimedeltaIndex("1 days") # generator expression @@ -220,5 +224,6 @@ def test_constructor_no_precision_raises(self): pd.Index(["2000"], dtype="timedelta64") def test_constructor_wrong_precision_raises(self): - with pytest.raises(ValueError): + msg = r"dtype timedelta64\[us\] cannot be converted to timedelta64\[ns\]" + with pytest.raises(ValueError, match=msg): pd.TimedeltaIndex(["2000"], dtype="timedelta64[us]") diff --git a/pandas/tests/indexes/timedeltas/test_delete.py b/pandas/tests/indexes/timedeltas/test_delete.py new file mode 100644 index 0000000000000..63f2b450aa818 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_delete.py @@ -0,0 +1,68 @@ +from pandas import TimedeltaIndex, timedelta_range +import pandas._testing as tm + + +class TestTimedeltaIndexDelete: + def test_delete(self): + idx = 
timedelta_range(start="1 Days", periods=5, freq="D", name="idx") + + # preserve freq + expected_0 = timedelta_range(start="2 Days", periods=4, freq="D", name="idx") + expected_4 = timedelta_range(start="1 Days", periods=4, freq="D", name="idx") + + # reset freq to None + expected_1 = TimedeltaIndex( + ["1 day", "3 day", "4 day", "5 day"], freq=None, name="idx" + ) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + with tm.external_error_raised((IndexError, ValueError)): + # either depending on numpy version + idx.delete(5) + + def test_delete_slice(self): + idx = timedelta_range(start="1 days", periods=10, freq="D", name="idx") + + # preserve freq + expected_0_2 = timedelta_range(start="4 days", periods=7, freq="D", name="idx") + expected_7_9 = timedelta_range(start="1 days", periods=7, freq="D", name="idx") + + # reset freq to None + expected_3_5 = TimedeltaIndex( + ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" + ) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + result = idx.delete(slice(n[0], n[-1] + 1)) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + def test_delete_doesnt_infer_freq(self): + # GH#30655 behavior matches DatetimeIndex + + tdi = TimedeltaIndex(["1 Day", "2 Days", None, "3 Days", "4 Days"]) + result = tdi.delete(2) + assert result.freq is None diff --git a/pandas/tests/indexes/timedeltas/test_fillna.py b/pandas/tests/indexes/timedeltas/test_fillna.py new file mode 100644 index 0000000000000..47b2f2ff597f4 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_fillna.py @@ -0,0 +1,17 @@ +from pandas import Index, NaT, Timedelta, TimedeltaIndex +import pandas._testing as tm + + +class TestFillNA: + def test_fillna_timedelta(self): + # GH#11343 + idx = TimedeltaIndex(["1 day", NaT, "3 day"]) + + exp = TimedeltaIndex(["1 day", "2 day", "3 day"]) + tm.assert_index_equal(idx.fillna(Timedelta("2 day")), exp) + + exp = TimedeltaIndex(["1 day", "3 hour", "3 day"]) + tm.assert_index_equal(idx.fillna(Timedelta("3 hour")), exp) + + exp = Index([Timedelta("1 day"), "x", Timedelta("3 day")], dtype=object) + tm.assert_index_equal(idx.fillna("x"), exp) diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 14fff6f9c85b5..8c39a9c40a69b 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -18,6 +18,11 @@ def test_ellipsis(self): assert result.equals(idx) assert result is not idx + def test_getitem_slice_keeps_name(self): + # GH#4226 + tdi = timedelta_range("1d", "5d", freq="H", name="timebucket") + assert tdi[1:].name == tdi.name + def test_getitem(self): idx1 = timedelta_range("1 day", "31 day", freq="D", name="idx") @@ -65,7 +70,81 @@ def test_timestamp_invalid_key(self, key): tdi.get_loc(key) +class TestGetLoc: + def test_get_loc(self): + idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) + + for method in [None, "pad", "backfill", "nearest"]: + assert idx.get_loc(idx[1], method) == 1 + assert 
idx.get_loc(idx[1].to_pytimedelta(), method) == 1 + assert idx.get_loc(str(idx[1]), method) == 1 + + assert idx.get_loc(idx[1], "pad", tolerance=Timedelta(0)) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=np.timedelta64(0, "s")) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=timedelta(0)) == 1 + + with pytest.raises(ValueError, match="unit abbreviation w/o a number"): + idx.get_loc(idx[1], method="nearest", tolerance="foo") + + with pytest.raises(ValueError, match="tolerance size must match"): + idx.get_loc( + idx[1], + method="nearest", + tolerance=[ + Timedelta(0).to_timedelta64(), + Timedelta(0).to_timedelta64(), + ], + ) + + for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: + assert idx.get_loc("1 day 1 hour", method) == loc + + # GH 16909 + assert idx.get_loc(idx[1].to_timedelta64()) == 1 + + # GH 16896 + assert idx.get_loc("0 days") == 0 + + def test_get_loc_nat(self): + tidx = TimedeltaIndex(["1 days 01:00:00", "NaT", "2 days 01:00:00"]) + + assert tidx.get_loc(pd.NaT) == 1 + assert tidx.get_loc(None) == 1 + assert tidx.get_loc(float("nan")) == 1 + assert tidx.get_loc(np.nan) == 1 + + +class TestGetIndexer: + def test_get_indexer(self): + idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + + res = idx.get_indexer(target, "nearest", tolerance=Timedelta("1 hour")) + tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) + + class TestWhere: + def test_where_doesnt_retain_freq(self): + tdi = timedelta_range("1 day", periods=3, freq="D", name="idx") + cond = [True, True, False] + expected = TimedeltaIndex([tdi[0], tdi[1], tdi[0]], freq=None, name="idx") + + result = tdi.where(cond, tdi[::-1]) + tm.assert_index_equal(result, expected) + def test_where_invalid_dtypes(self): tdi = timedelta_range("1 day", periods=3, freq="D", name="idx") @@ -176,217 +255,6 @@ def test_take_fill_value(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - with pytest.raises(IndexError): + msg = "index -5 is out of bounds for (axis 0 with )?size 3" + with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) - - -class TestTimedeltaIndex: - def test_insert_empty(self): - # Corner case inserting with length zero doesnt raise IndexError - idx = timedelta_range("1 Day", periods=3) - td = idx[0] - - idx[:0].insert(0, td) - idx[:0].insert(1, td) - idx[:0].insert(-1, td) - - def test_insert(self): - - idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") - - result = idx.insert(2, timedelta(days=5)) - exp = TimedeltaIndex(["4day", "1day", "5day", "2day"], name="idx") - tm.assert_index_equal(result, exp) - - # insertion of non-datetime should coerce to object index - result = idx.insert(1, "inserted") - expected = Index( - [Timedelta("4day"), "inserted", Timedelta("1day"), Timedelta("2day")], - name="idx", - ) - assert not isinstance(result, TimedeltaIndex) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - - idx = timedelta_range("1day 00:00:01", periods=3, freq="s", name="idx") - - # preserve freq - 
expected_0 = TimedeltaIndex( - ["1day", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], - name="idx", - freq="s", - ) - expected_3 = TimedeltaIndex( - ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:04"], - name="idx", - freq="s", - ) - - # reset freq to None - expected_1_nofreq = TimedeltaIndex( - ["1day 00:00:01", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], - name="idx", - freq=None, - ) - expected_3_nofreq = TimedeltaIndex( - ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:05"], - name="idx", - freq=None, - ) - - cases = [ - (0, Timedelta("1day"), expected_0), - (-3, Timedelta("1day"), expected_0), - (3, Timedelta("1day 00:00:04"), expected_3), - (1, Timedelta("1day 00:00:01"), expected_1_nofreq), - (3, Timedelta("1day 00:00:05"), expected_3_nofreq), - ] - - for n, d, expected in cases: - result = idx.insert(n, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - @pytest.mark.parametrize( - "null", [None, np.nan, np.timedelta64("NaT"), pd.NaT, pd.NA] - ) - def test_insert_nat(self, null): - # GH 18295 (test missing) - idx = timedelta_range("1day", "3day") - result = idx.insert(1, null) - expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"]) - tm.assert_index_equal(result, expected) - - def test_insert_invalid_na(self): - idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") - with pytest.raises(TypeError, match="incompatible label"): - idx.insert(0, np.datetime64("NaT")) - - def test_insert_dont_cast_strings(self): - # To match DatetimeIndex and PeriodIndex behavior, dont try to - # parse strings to Timedelta - idx = timedelta_range("1day", "3day") - - result = idx.insert(0, "1 Day") - assert result.dtype == object - assert result[0] == "1 Day" - - def test_delete(self): - idx = timedelta_range(start="1 Days", periods=5, freq="D", name="idx") - - # preserve freq - expected_0 = timedelta_range(start="2 Days", periods=4, freq="D", name="idx") - expected_4 = timedelta_range(start="1 Days", periods=4, freq="D", name="idx") - - # reset freq to None - expected_1 = TimedeltaIndex( - ["1 day", "3 day", "4 day", "5 day"], freq=None, name="idx" - ) - - cases = { - 0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1, - } - for n, expected in cases.items(): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - with pytest.raises((IndexError, ValueError)): - # either depending on numpy version - idx.delete(5) - - def test_delete_slice(self): - idx = timedelta_range(start="1 days", periods=10, freq="D", name="idx") - - # preserve freq - expected_0_2 = timedelta_range(start="4 days", periods=7, freq="D", name="idx") - expected_7_9 = timedelta_range(start="1 days", periods=7, freq="D", name="idx") - - # reset freq to None - expected_3_5 = TimedeltaIndex( - ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" - ) - - cases = { - (0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5, - } - for n, expected in cases.items(): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - result = idx.delete(slice(n[0], n[-1] + 1)) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - def test_get_loc(self): - idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) - - 
for method in [None, "pad", "backfill", "nearest"]: - assert idx.get_loc(idx[1], method) == 1 - assert idx.get_loc(idx[1].to_pytimedelta(), method) == 1 - assert idx.get_loc(str(idx[1]), method) == 1 - - assert idx.get_loc(idx[1], "pad", tolerance=Timedelta(0)) == 1 - assert idx.get_loc(idx[1], "pad", tolerance=np.timedelta64(0, "s")) == 1 - assert idx.get_loc(idx[1], "pad", tolerance=timedelta(0)) == 1 - - with pytest.raises(ValueError, match="unit abbreviation w/o a number"): - idx.get_loc(idx[1], method="nearest", tolerance="foo") - - with pytest.raises(ValueError, match="tolerance size must match"): - idx.get_loc( - idx[1], - method="nearest", - tolerance=[ - Timedelta(0).to_timedelta64(), - Timedelta(0).to_timedelta64(), - ], - ) - - for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: - assert idx.get_loc("1 day 1 hour", method) == loc - - # GH 16909 - assert idx.get_loc(idx[1].to_timedelta64()) == 1 - - # GH 16896 - assert idx.get_loc("0 days") == 0 - - def test_get_loc_nat(self): - tidx = TimedeltaIndex(["1 days 01:00:00", "NaT", "2 days 01:00:00"]) - - assert tidx.get_loc(pd.NaT) == 1 - assert tidx.get_loc(None) == 1 - assert tidx.get_loc(float("nan")) == 1 - assert tidx.get_loc(np.nan) == 1 - - def test_get_indexer(self): - idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) - tm.assert_numpy_array_equal( - idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) - ) - - target = pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) - tm.assert_numpy_array_equal( - idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) - ) - tm.assert_numpy_array_equal( - idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) - ) - tm.assert_numpy_array_equal( - idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) - ) - - res = idx.get_indexer(target, "nearest", tolerance=Timedelta("1 hour")) - tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) diff --git a/pandas/tests/indexes/timedeltas/test_insert.py b/pandas/tests/indexes/timedeltas/test_insert.py new file mode 100644 index 0000000000000..b214e009db869 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_insert.py @@ -0,0 +1,101 @@ +from datetime import timedelta + +import numpy as np +import pytest + +import pandas as pd +from pandas import Index, Timedelta, TimedeltaIndex, timedelta_range +import pandas._testing as tm + + +class TestTimedeltaIndexInsert: + def test_insert(self): + + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + + result = idx.insert(2, timedelta(days=5)) + exp = TimedeltaIndex(["4day", "1day", "5day", "2day"], name="idx") + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, "inserted") + expected = Index( + [Timedelta("4day"), "inserted", Timedelta("1day"), Timedelta("2day")], + name="idx", + ) + assert not isinstance(result, TimedeltaIndex) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + idx = timedelta_range("1day 00:00:01", periods=3, freq="s", name="idx") + + # preserve freq + expected_0 = TimedeltaIndex( + ["1day", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq="s", + ) + expected_3 = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:04"], + name="idx", + freq="s", + ) + + # reset freq to None + expected_1_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq=None, + ) + expected_3_nofreq = 
TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:05"], + name="idx", + freq=None, + ) + + cases = [ + (0, Timedelta("1day"), expected_0), + (-3, Timedelta("1day"), expected_0), + (3, Timedelta("1day 00:00:04"), expected_3), + (1, Timedelta("1day 00:00:01"), expected_1_nofreq), + (3, Timedelta("1day 00:00:05"), expected_3_nofreq), + ] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + @pytest.mark.parametrize( + "null", [None, np.nan, np.timedelta64("NaT"), pd.NaT, pd.NA] + ) + def test_insert_nat(self, null): + # GH 18295 (test missing) + idx = timedelta_range("1day", "3day") + result = idx.insert(1, null) + expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"]) + tm.assert_index_equal(result, expected) + + def test_insert_invalid_na(self): + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + with pytest.raises(TypeError, match="incompatible label"): + idx.insert(0, np.datetime64("NaT")) + + def test_insert_dont_cast_strings(self): + # To match DatetimeIndex and PeriodIndex behavior, dont try to + # parse strings to Timedelta + idx = timedelta_range("1day", "3day") + + result = idx.insert(0, "1 Day") + assert result.dtype == object + assert result[0] == "1 Day" + + def test_insert_empty(self): + # Corner case inserting with length zero doesnt raise IndexError + idx = timedelta_range("1 Day", periods=3) + td = idx[0] + + idx[:0].insert(0, td) + idx[:0].insert(1, td) + idx[:0].insert(-1, td) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 6606507dabc29..aa1bf997fc66b 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -3,13 +3,11 @@ import numpy as np import pytest -from pandas.core.dtypes.generic import ABCDateOffset - import pandas as pd from pandas import Series, TimedeltaIndex, timedelta_range import pandas._testing as tm -from pandas.tseries.offsets import Day, Hour +from pandas.tseries.offsets import DateOffset, Day, Hour class TestTimedeltaIndexOps: @@ -136,9 +134,9 @@ def test_order(self): tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - def test_drop_duplicates_metadata(self): + def test_drop_duplicates_metadata(self, freq_sample): # GH 10115 - idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") + idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -149,36 +147,38 @@ def test_drop_duplicates_metadata(self): tm.assert_index_equal(idx, result) assert result.freq is None - def test_drop_duplicates(self): + @pytest.mark.parametrize( + "keep, expected, index", + [ + ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), + ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10), + ), + ], + ) + def test_drop_duplicates(self, freq_sample, keep, expected, index): # to check Index/Series compat - base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") - idx = base.append(base[:5]) + idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") + idx = idx.append(idx[:5]) - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - 
tm.assert_series_equal(res, Series(base)) + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) + expected = idx[~expected] - res = idx.drop_duplicates(keep="last") - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep="last") - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, expected) - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + result = Series(idx).drop_duplicates(keep=keep) + tm.assert_series_equal(result, Series(expected, index=index)) - @pytest.mark.parametrize( - "freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"] - ) - def test_infer_freq(self, freq): + def test_infer_freq(self, freq_sample): # GH#11018 - idx = pd.timedelta_range("1", freq=freq, periods=10) + idx = pd.timedelta_range("1", freq=freq_sample, periods=10) result = pd.TimedeltaIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) - assert result.freq == freq + assert result.freq == freq_sample def test_repeat(self): index = pd.timedelta_range("1 days", periods=2, freq="D") @@ -263,7 +263,7 @@ def test_freq_setter(self, values, freq): # can set to an offset, converting from string if necessary idx._data.freq = freq assert idx.freq == freq - assert isinstance(idx.freq, ABCDateOffset) + assert isinstance(idx.freq, DateOffset) # can reset to None idx._data.freq = None diff --git a/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/pandas/tests/indexes/timedeltas/test_partial_slicing.py index 29e2c7dd20be0..a0ef953db3600 100644 --- a/pandas/tests/indexes/timedeltas/test_partial_slicing.py +++ b/pandas/tests/indexes/timedeltas/test_partial_slicing.py @@ -7,11 +7,6 @@ class TestSlicing: - def test_slice_keeps_name(self): - # GH4226 - dr = pd.timedelta_range("1d", "5d", freq="H", name="timebucket") - assert dr[1:].name == dr.name - def test_partial_slice(self): rng = timedelta_range("1 day 10:11:12", freq="h", periods=500) s = Series(np.arange(len(rng)), index=rng) diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 44f4a2adedaad..1b86cd1df5a7a 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -69,3 +69,67 @@ def test_tdi_round(self): td.round(freq="M") with pytest.raises(ValueError, match=msg): elt.round(freq="M") + + @pytest.mark.parametrize( + "freq,msg", + [ + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ], + ) + def test_tdi_round_invalid(self, freq, msg): + t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") + + with pytest.raises(ValueError, match=msg): + t1.round(freq) + with pytest.raises(ValueError, match=msg): + # Same test for TimedeltaArray + t1._data.round(freq) + + # TODO: de-duplicate with test_tdi_round + def test_round(self): + t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") + t2 = -1 * t1 + t1a = timedelta_range("1 days", periods=3, freq="1 min 2 s") + t1c = TimedeltaIndex([1, 1, 1], unit="D") + + # note that negative times round DOWN! 
so don't give whole numbers + for (freq, s1, s2) in [ + ("N", t1, t2), + ("U", t1, t2), + ( + "L", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + ), + ), + ( + "S", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + ), + ), + ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),), + ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),), + ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), + ]: + + r1 = t1.round(freq) + tm.assert_index_equal(r1, s1) + r2 = t2.round(freq) + tm.assert_index_equal(r2, s2) + + def test_components(self): + rng = timedelta_range("1 days, 10:11:12", periods=2, freq="s") + rng.components + + # with nat + s = Series(rng) + s[1] = np.nan + + result = s.dt.components + assert not result.iloc[0].isna().all() + assert result.iloc[1].isna().all() diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 0aa784cbb7710..4808950f17b52 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -107,7 +107,6 @@ def test_intersection_bug_1708(self): expected = timedelta_range("1 day 01:00:00", periods=3, freq="h") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_equal(self, sort): # GH 24471 Test intersection outcome given the sort keyword # for equal indicies intersection should return the original index @@ -123,7 +122,6 @@ def test_intersection_equal(self, sort): assert inter is first @pytest.mark.parametrize("period_1, period_2", [(0, 4), (4, 0)]) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_zero_length(self, period_1, period_2, sort): # GH 24471 test for non overlap the intersection should be zero length index_1 = timedelta_range("1 day", periods=period_1, freq="h") @@ -132,7 +130,6 @@ def test_intersection_zero_length(self, period_1, period_2, sort): result = index_1.intersection(index_2, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_zero_length_input_index(self, sort): # GH 24966 test for 0-len intersections are copied index_1 = timedelta_range("1 day", periods=0, freq="h") @@ -162,7 +159,6 @@ def test_zero_length_input_index(self, sort): ), ], ) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, rng, expected, sort): # GH 4690 (with tz) base = timedelta_range("1 day", periods=4, freq="h", name="idx") @@ -195,7 +191,6 @@ def test_intersection(self, rng, expected, sort): ), ], ) - @pytest.mark.parametrize("sort", [None, False]) def test_intersection_non_monotonic(self, rng, expected, sort): # 24471 non-monotonic base = TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx") @@ -213,7 +208,6 @@ def test_intersection_non_monotonic(self, rng, expected, sort): class TestTimedeltaIndexDifference: - @pytest.mark.parametrize("sort", [None, False]) def test_difference_freq(self, sort): # GH14323: Difference of TimedeltaIndex should not preserve frequency @@ -231,7 +225,6 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) - @pytest.mark.parametrize("sort", [None, False]) def test_difference_sort(self, sort): index = pd.TimedeltaIndex( diff --git a/pandas/tests/indexes/timedeltas/test_shift.py b/pandas/tests/indexes/timedeltas/test_shift.py index 98933ff0423ab..c02aa71d97aac 100644 --- 
a/pandas/tests/indexes/timedeltas/test_shift.py +++ b/pandas/tests/indexes/timedeltas/test_shift.py @@ -71,5 +71,5 @@ def test_tdi_shift_nonstandard_freq(self): def test_shift_no_freq(self): # GH#19147 tdi = TimedeltaIndex(["1 days 01:00:00", "2 days 01:00:00"], freq=None) - with pytest.raises(NullFrequencyError): + with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"): tdi.shift(2) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index d4a94f8693081..129bdef870a14 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -11,6 +11,7 @@ Series, Timedelta, TimedeltaIndex, + array, date_range, timedelta_range, ) @@ -28,7 +29,7 @@ class TestTimedeltaIndex(DatetimeLike): def indices(self): return tm.makeTimedeltaIndex(10) - def create_index(self): + def create_index(self) -> TimedeltaIndex: return pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) def test_numeric_compat(self): @@ -42,21 +43,6 @@ def test_shift(self): def test_pickle_compat_construction(self): pass - def test_fillna_timedelta(self): - # GH 11343 - idx = pd.TimedeltaIndex(["1 day", pd.NaT, "3 day"]) - - exp = pd.TimedeltaIndex(["1 day", "2 day", "3 day"]) - tm.assert_index_equal(idx.fillna(pd.Timedelta("2 day")), exp) - - exp = pd.TimedeltaIndex(["1 day", "3 hour", "3 day"]) - idx.fillna(pd.Timedelta("3 hour")) - - exp = pd.Index( - [pd.Timedelta("1 day"), "x", pd.Timedelta("3 day")], dtype=object - ) - tm.assert_index_equal(idx.fillna("x"), exp) - def test_isin(self): index = tm.makeTimedeltaIndex(4) @@ -111,6 +97,26 @@ def test_sort_values(self): tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]), check_dtype=False) + @pytest.mark.parametrize("klass", [list, np.array, array, Series]) + def test_searchsorted_different_argument_classes(self, klass): + idx = TimedeltaIndex(["1 day", "2 days", "3 days"]) + result = idx.searchsorted(klass(idx)) + expected = np.arange(len(idx), dtype=result.dtype) + tm.assert_numpy_array_equal(result, expected) + + result = idx._data.searchsorted(klass(idx)) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "arg", + [[1, 2], ["a", "b"], [pd.Timestamp("2020-01-01", tz="Europe/London")] * 2], + ) + def test_searchsorted_invalid_argument_dtype(self, arg): + idx = TimedeltaIndex(["1 day", "2 days", "3 days"]) + msg = "searchsorted requires compatible dtype" + with pytest.raises(TypeError, match=msg): + idx.searchsorted(arg) + def test_argmin_argmax(self): idx = TimedeltaIndex(["1 day 00:00:05", "1 day 00:00:01", "1 day 00:00:02"]) assert idx.argmin() == 1 @@ -147,19 +153,6 @@ def test_pass_TimedeltaIndex_to_index(self): tm.assert_numpy_array_equal(idx.values, expected.values) - def test_pickle(self): - - rng = timedelta_range("1 days", periods=10) - rng_p = tm.round_trip_pickle(rng) - tm.assert_index_equal(rng, rng_p) - - def test_hash_error(self): - index = timedelta_range("1 days", periods=10) - with pytest.raises( - TypeError, match=(f"unhashable type: {repr(type(index).__name__)}") - ): - hash(index) - def test_append_numpy_bug_1681(self): td = timedelta_range("1 days", "10 days", freq="2D") @@ -170,13 +163,6 @@ def test_append_numpy_bug_1681(self): result = a.append(c) assert (result["B"] == td).all() - def test_delete_doesnt_infer_freq(self): - # GH#30655 behavior matches DatetimeIndex - - tdi = pd.TimedeltaIndex(["1 Day", "2 Days", None, "3 Days", "4 Days"]) - result = tdi.delete(2) - assert result.freq is 
None - def test_fields(self): rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") tm.assert_index_equal(rng.days, Index([1, 1], dtype="int64")) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 9f12af9a96104..c07a6471c732f 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -38,6 +38,7 @@ def test_linspace_behavior(self, periods, freq): result = timedelta_range(start="0 days", end="4 days", periods=periods) expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) + assert result.freq == freq def test_errors(self): # not enough params diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 43036fbbd9844..03c3034772bc6 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -128,9 +128,6 @@ def test_loc_with_slices(self): with pytest.raises(NotImplementedError, match=msg): s[Interval(3, 4, closed="left") :] - # TODO with non-existing intervals ? - # s.loc[Interval(-1, 0):Interval(2, 3)] - # slice of scalar expected = s.iloc[:3] @@ -143,9 +140,32 @@ def test_loc_with_slices(self): tm.assert_series_equal(expected, s[:2.5]) tm.assert_series_equal(expected, s[0.1:2.5]) - # slice of scalar with step != 1 - with pytest.raises(ValueError): - s[0:4:2] + def test_slice_step_ne1(self): + # GH#31658 slice of scalar with step != 1 + s = self.s + expected = s.iloc[0:4:2] + + result = s[0:4:2] + tm.assert_series_equal(result, expected) + + result2 = s[0:4][::2] + tm.assert_series_equal(result2, expected) + + def test_slice_float_start_stop(self): + # GH#31658 slicing with integers is positional, with floats is not + # supported + ser = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) + + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + with pytest.raises(ValueError, match=msg): + ser[1.5:9.5:2] + + def test_slice_interval_step(self): + # GH#31658 allows for integer step!=1, not Interval step + s = self.s + msg = "label-based slicing with step!=1 is not supported for IntervalIndex" + with pytest.raises(ValueError, match=msg): + s[0 : 4 : Interval(0, 1)] def test_loc_with_overlap(self): diff --git a/pandas/tests/indexing/multiindex/conftest.py b/pandas/tests/indexing/multiindex/conftest.py index 0256f5e35e1db..c69d6f86a6ce6 100644 --- a/pandas/tests/indexing/multiindex/conftest.py +++ b/pandas/tests/indexing/multiindex/conftest.py @@ -2,7 +2,6 @@ import pytest from pandas import DataFrame, Index, MultiIndex -import pandas._testing as tm @pytest.fixture @@ -16,17 +15,3 @@ def multiindex_dataframe_random_data(): return DataFrame( np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") ) - - -@pytest.fixture -def multiindex_year_month_day_dataframe_random_data(): - """ - DataFrame with 3 level MultiIndex (year, month, day) covering - first 100 business days from 2000-01-01 with random data - """ - tdf = tm.makeTimeDataFrame(100) - ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() - # use Int64Index, to make sure things work - ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) - ymd.index.set_names(["year", "month", "day"], inplace=True) - return ymd diff --git a/pandas/tests/indexing/multiindex/test_getitem.py 
b/pandas/tests/indexing/multiindex/test_getitem.py index 7e75b5324445e..54b22dbc53466 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -87,8 +87,8 @@ def test_series_getitem_returns_scalar( (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"), - (lambda s: s.__getitem__(len(s)), IndexError, "is out of bounds"), - (lambda s: s[len(s)], IndexError, "is out of bounds"), + (lambda s: s.__getitem__(len(s)), KeyError, ""), # match should include len(s) + (lambda s: s[len(s)], KeyError, ""), # match should include len(s) ( lambda s: s.iloc[len(s)], IndexError, diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index 8ea1cebd7bf7b..ea4453b8dd6eb 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas import DataFrame, MultiIndex, Series +from pandas import DataFrame, Series import pandas._testing as tm @@ -83,10 +83,3 @@ def loop(mi, df, keys): mi = df.set_index(cols[:-1]) assert not mi.index.lexsort_depth < i loop(mi, df, keys) - - -@pytest.mark.slow -def test_large_mi_dataframe_indexing(): - # GH10645 - result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) - assert not (10 ** 6, 0) in result diff --git a/pandas/tests/indexing/multiindex/test_insert.py b/pandas/tests/indexing/multiindex/test_insert.py index 835e61da2fb3e..42922c3deeee4 100644 --- a/pandas/tests/indexing/multiindex/test_insert.py +++ b/pandas/tests/indexing/multiindex/test_insert.py @@ -5,7 +5,7 @@ class TestMultiIndexInsertion: - def test_mixed_depth_insert(self): + def test_setitem_mixed_depth(self): arrays = [ ["a", "top", "top", "routine1", "routine1", "routine2"], ["", "OD", "OD", "result1", "result2", "result1"], diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 0064187a94265..5e5fcd3db88d8 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -26,26 +26,6 @@ def test_multiindex_perf_warn(self): with tm.assert_produces_warning(PerformanceWarning): df.loc[(0,)] - def test_multiindex_contains_dropped(self): - # GH 19027 - # test that dropped MultiIndex levels are not in the MultiIndex - # despite continuing to be in the MultiIndex's levels - idx = MultiIndex.from_product([[1, 2], [3, 4]]) - assert 2 in idx - idx = idx.drop(2) - - # drop implementation keeps 2 in the levels - assert 2 in idx.levels[0] - # but it should no longer be in the index itself - assert 2 not in idx - - # also applies to strings - idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) - assert "a" in idx - idx = idx.drop("a") - assert "a" in idx.levels[0] - assert "a" not in idx - def test_indexing_over_hashtable_size_cutoff(self): n = 10000 @@ -85,14 +65,6 @@ def test_multi_nan_indexing(self): ) tm.assert_frame_equal(result, expected) - def test_contains(self): - # GH 24570 - tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") - idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) - assert tx[0] in idx - assert "element_not_exit" not in idx - assert "0 day 09:30:00" in idx - def test_nested_tuples_duplicates(self): # GH#30892 diff --git 
a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 9d181bdcb9491..ed11af8ef68ad 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex +from pandas import DataFrame, Float64Index, Int64Index, MultiIndex import pandas._testing as tm @@ -126,7 +126,32 @@ def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): # this works...for now df["A"].iloc[14] = 5 - assert df["A"][14] == 5 + assert df["A"].iloc[14] == 5 + + @pytest.mark.parametrize("dtype", [int, float]) + def test_getitem_intkey_leading_level( + self, multiindex_year_month_day_dataframe_random_data, dtype + ): + # GH#33355 dont fall-back to positional when leading level is int + ymd = multiindex_year_month_day_dataframe_random_data + levels = ymd.index.levels + ymd.index = ymd.index.set_levels([levels[0].astype(dtype)] + levels[1:]) + ser = ymd["A"] + mi = ser.index + assert isinstance(mi, MultiIndex) + if dtype is int: + assert isinstance(mi.levels[0], Int64Index) + else: + assert isinstance(mi.levels[0], Float64Index) + + assert 14 not in mi.levels[0] + assert not mi.levels[0]._should_fallback_to_positional() + assert not mi._should_fallback_to_positional() + + with pytest.raises(KeyError, match="14"): + ser[14] + with pytest.raises(KeyError, match="14"): + mi.get_value(ser, 14) # --------------------------------------------------------------------- # AMBIGUOUS CASES! @@ -140,7 +165,7 @@ def test_partial_loc_missing(self, multiindex_year_month_day_dataframe_random_da tm.assert_series_equal(result, expected) # need to put in some work here - + # FIXME: dont leave commented-out # self.ymd.loc[2000, 0] = 0 # assert (self.ymd.loc[2000]['A'] == 0).all() diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 1e641760f7e8d..853b92ea91274 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -137,7 +137,8 @@ def test_multiindex_setitem(self): tm.assert_frame_equal(df.loc[["bar"]], expected) # raise because these have differing levels - with pytest.raises(TypeError): + msg = "cannot align on a multi-index with out specifying the join levels" + with pytest.raises(TypeError, match=msg): df.loc["bar"] *= 2 # from SO @@ -203,10 +204,14 @@ def test_multiindex_assignment(self): tm.assert_series_equal(df.loc[4, "c"], exp) # invalid assignments - with pytest.raises(ValueError): + msg = ( + "cannot set using a multi-index selection indexer " + "with a different length than the value" + ) + with pytest.raises(ValueError, match=msg): df.loc[4, "c"] = [0, 1, 2, 3] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.loc[4, "c"] = [0] # groupby example @@ -231,6 +236,7 @@ def f(name, df2): f_index ) + # FIXME: dont leave commented-out # TODO(wesm): unused? 
# new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T @@ -250,7 +256,11 @@ def test_series_setitem(self, multiindex_year_month_day_dataframe_random_data): assert notna(s.values[65:]).all() s[2000, 3, 10] = np.nan - assert isna(s[49]) + assert isna(s.iloc[49]) + + with pytest.raises(KeyError, match="49"): + # GH#33355 dont fall-back to positional when leading level is int + s[49] def test_frame_getitem_setitem_boolean(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 6fa9d3bd2cdbb..f367a92d0b006 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -111,7 +111,11 @@ def test_per_axis_per_level_getitem(self): expected = df.iloc[[2, 3]] tm.assert_frame_equal(result, expected) - with pytest.raises(ValueError): + msg = ( + "cannot index with a boolean indexer " + "that is not the same length as the index" + ) + with pytest.raises(ValueError, match=msg): df.loc[(slice(None), np.array([True, False])), :] # ambiguous notation @@ -411,7 +415,11 @@ def test_per_axis_per_level_doc_examples(self): tm.assert_frame_equal(result, expected) # not sorted - with pytest.raises(UnsortedIndexError): + msg = ( + "MultiIndex slicing requires the index to be lexsorted: " + r"slicing on levels \[1\], lexsort depth 1" + ) + with pytest.raises(UnsortedIndexError, match=msg): df.loc["A1", ("a", slice("foo"))] # GH 16734: not sorted, but no real slicing @@ -480,14 +488,10 @@ def test_loc_axis_arguments(self): tm.assert_frame_equal(result, expected) # invalid axis - with pytest.raises(ValueError): - df.loc(axis=-1)[:, :, ["C1", "C3"]] - - with pytest.raises(ValueError): - df.loc(axis=2)[:, :, ["C1", "C3"]] - - with pytest.raises(ValueError): - df.loc(axis="foo")[:, :, ["C1", "C3"]] + for i in [-1, 2, "foo"]: + msg = f"No axis named {i} for object type DataFrame" + with pytest.raises(ValueError, match=msg): + df.loc(axis=i)[:, :, ["C1", "C3"]] def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self): @@ -628,12 +632,14 @@ def test_per_axis_per_level_setitem(self): # not enough values df = df_orig.copy() - with pytest.raises(ValueError): + msg = "setting an array element with a sequence." 
+ with pytest.raises(ValueError, match=msg): df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( [[100], [100, 100]], dtype="int64" ) - with pytest.raises(ValueError): + msg = "Must have equal len keys and value when setting with an iterable" + with pytest.raises(ValueError, match=msg): df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( [100, 100, 100, 100], dtype="int64" ) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 8a8ac584c16c2..829ee61197ff2 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -95,15 +95,6 @@ def test_getitem_scalar(self): result = s[cats[0]] assert result == expected - def test_slicing_directly(self): - cat = Categorical(["a", "b", "c", "d", "a", "b", "c"]) - sliced = cat[3] - assert sliced == "d" - sliced = cat[3:5] - expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"]) - tm.assert_numpy_array_equal(sliced._codes, expected._codes) - tm.assert_index_equal(sliced.categories, expected.categories) - def test_slicing(self): cat = Series(Categorical([1, 2, 3, 4])) reversed = cat[::-1] @@ -778,9 +769,9 @@ def test_map_with_dict_or_series(self): pd.timedelta_range(start="1d", periods=3).array, ], ) - def test_loc_with_non_string_categories(self, idx_values, ordered_fixture): + def test_loc_with_non_string_categories(self, idx_values, ordered): # GH-17569 - cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture) + cat_idx = CategoricalIndex(idx_values, ordered=ordered) df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx) sl = slice(idx_values[0], idx_values[1]) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index bea8eae9bb850..c390347236ad3 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -297,7 +297,8 @@ def test_setitem_index_object(self, val, exp_dtype): if exp_dtype is IndexError: temp = obj.copy() - with pytest.raises(exp_dtype): + msg = "index 5 is out of bounds for axis 0 with size 4" + with pytest.raises(exp_dtype, match=msg): temp[5] = 5 else: exp_index = pd.Index(list("abcd") + [val]) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 87520f5ab2577..18b9898e7d800 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -1,9 +1,17 @@ +import re + import numpy as np import pytest from pandas import DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series import pandas._testing as tm +# We pass through the error message from numpy +_slice_iloc_msg = re.escape( + "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) " + "and integer or boolean arrays are valid indices" +) + def gen_obj(klass, index): if klass is Series: @@ -32,20 +40,7 @@ def check(self, result, original, indexer, getitem): tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize( - "index_func", - [ - tm.makeStringIndex, - tm.makeUnicodeIndex, - tm.makeCategoricalIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, - tm.makeIntIndex, - tm.makeRangeIndex, - ], - ) - def test_scalar_error(self, index_func): + def test_scalar_error(self, series_with_simple_index): # GH 4892 # float_indexers should raise exceptions @@ -54,19 +49,13 @@ def test_scalar_error(self, index_func): # but is specifically testing for the error # message - i = index_func(5) - - s = Series(np.arange(len(i)), index=i) + s = 
series_with_simple_index - msg = "Cannot index by location index" + msg = "Cannot index by location index with a non-integer key" with pytest.raises(TypeError, match=msg): s.iloc[3.0] - msg = ( - f"cannot do positional indexing on {type(i).__name__} with these " - r"indexers \[3\.0\] of type float" - ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(IndexError, match=_slice_iloc_msg): s.iloc[3.0] = 0 @pytest.mark.parametrize( @@ -91,54 +80,21 @@ def test_scalar_non_numeric(self, index_func, klass): s = gen_obj(klass, i) # getting - for idxr, getitem in [(lambda x: x.iloc, False), (lambda x: x, True)]: + with pytest.raises(KeyError, match="^3.0$"): + s[3.0] - # gettitem on a DataFrame is a KeyError as it is indexing - # via labels on the columns - if getitem and isinstance(s, DataFrame): - error = KeyError - msg = r"^3(\.0)?$" - else: - error = TypeError - msg = ( - r"cannot do (label|positional) indexing " - fr"on {type(i).__name__} with these indexers \[3\.0\] of " - r"type float|" - "Cannot index by location index with a " - "non-integer key" - ) - with pytest.raises(error, match=msg): - idxr(s)[3.0] - - # label based can be a TypeError or KeyError - if s.index.inferred_type in { - "categorical", - "string", - "unicode", - "mixed", - }: - error = KeyError - msg = r"^3\.0$" - else: - error = TypeError - msg = ( - r"cannot do (label|positional) indexing " - fr"on {type(i).__name__} with these indexers \[3\.0\] of " - "type float" - ) - with pytest.raises(error, match=msg): + msg = "Cannot index by location index with a non-integer key" + with pytest.raises(TypeError, match=msg): + s.iloc[3.0] + + with pytest.raises(KeyError, match="^3.0$"): s.loc[3.0] # contains assert 3.0 not in s # setting with a float fails with iloc - msg = ( - r"cannot do (label|positional) indexing " - fr"on {type(i).__name__} with these indexers \[3\.0\] of " - "type float" - ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(IndexError, match=_slice_iloc_msg): s.iloc[3.0] = 0 # setting with an indexer @@ -182,12 +138,7 @@ def test_scalar_non_numeric_series_fallback(self, index_func): i = index_func(5) s = Series(np.arange(len(i)), index=i) s[3] - msg = ( - r"cannot do (label|positional) indexing " - fr"on {type(i).__name__} with these indexers \[3\.0\] of " - "type float" - ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(KeyError, match="^3.0$"): s[3.0] def test_scalar_with_mixed(self): @@ -195,16 +146,12 @@ def test_scalar_with_mixed(self): s2 = Series([1, 2, 3], index=["a", "b", "c"]) s3 = Series([1, 2, 3], index=["a", "b", 1.5]) - # lookup in a pure stringstr - # with an invalid indexer - msg = ( - "cannot do label indexing " - fr"on {Index.__name__} with these indexers \[1\.0\] of " - r"type float|" - "Cannot index by location index with a non-integer key" - ) - with pytest.raises(TypeError, match=msg): + # lookup in a pure string index with an invalid indexer + + with pytest.raises(KeyError, match="^1.0$"): s2[1.0] + + msg = "Cannot index by location index with a non-integer key" with pytest.raises(TypeError, match=msg): s2.iloc[1.0] @@ -217,12 +164,7 @@ def test_scalar_with_mixed(self): # mixed index so we have label # indexing - msg = ( - "cannot do label indexing " - fr"on {Index.__name__} with these indexers \[1\.0\] of " - "type float" - ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(KeyError, match="^1.0$"): s3[1.0] result = s3[1] @@ -327,12 +269,7 @@ def test_scalar_float(self, klass): with pytest.raises(TypeError, match=msg): 
s.iloc[3.0] - msg = ( - "cannot do positional indexing " - fr"on {Float64Index.__name__} with these indexers \[3\.0\] of " - "type float" - ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(IndexError, match=_slice_iloc_msg): s2.iloc[3.0] = 0 @pytest.mark.parametrize( @@ -376,11 +313,7 @@ def test_slice_non_numeric(self, index_func, l, klass): idxr(s)[l] # setitem - msg = ( - "cannot do positional indexing " - fr"on {type(index).__name__} with these indexers \[(3|4)\.0\] of " - "type float" - ) + msg = "slice indices must be integers or None or have an __index__ method" with pytest.raises(TypeError, match=msg): s.iloc[l] = 0 @@ -390,7 +323,7 @@ def test_slice_non_numeric(self, index_func, l, klass): r"\[(3|4)(\.0)?\] " r"of type (float|int)" ) - for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]: + for idxr in [lambda x: x.loc, lambda x: x]: with pytest.raises(TypeError, match=msg): idxr(s)[l] = 0 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 683d4f2605712..c97cd81c84726 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -349,7 +349,6 @@ def test_iloc_setitem_dups(self): df = concat([df1, df2], axis=1) expected = df.fillna(3) - expected["A"] = expected["A"].astype("float64") inds = np.isnan(df.iloc[:, 0]) mask = inds[inds].index df.iloc[mask, 0] = df.iloc[mask, 2] @@ -487,7 +486,7 @@ def test_iloc_getitem_doc_issue(self): columns = list(range(0, 8, 2)) df = DataFrame(arr, index=index, columns=columns) - df._data.blocks[0].mgr_locs + df._mgr.blocks[0].mgr_locs result = df.iloc[1:5, 2:4] str(result) result.dtypes @@ -694,3 +693,43 @@ def test_series_indexing_zerodim_np_array(self): s = Series([1, 2]) result = s.iloc[np.array(0)] assert result == 1 + + def test_iloc_setitem_categorical_updates_inplace(self): + # Mixed dtype ensures we go through take_split_path in setitem_with_indexer + cat = pd.Categorical(["A", "B", "C"]) + df = pd.DataFrame({1: cat, 2: [1, 2, 3]}) + + # This should modify our original values in-place + df.iloc[:, 0] = cat[::-1] + + expected = pd.Categorical(["C", "B", "A"]) + tm.assert_categorical_equal(cat, expected) + + +class TestILocSetItemDuplicateColumns: + def test_iloc_setitem_scalar_duplicate_columns(self): + # GH#15686, duplicate columns and mixed dtype + df1 = pd.DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = pd.DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) + df = pd.concat([df1, df2], axis=1) + df.iloc[0, 0] = -1 + + assert df.iloc[0, 0] == -1 + assert df.iloc[0, 2] == 3 + assert df.dtypes.iloc[2] == np.int64 + + def test_iloc_setitem_list_duplicate_columns(self): + # GH#22036 setting with same-sized list + df = pd.DataFrame([[0, "str", "str2"]], columns=["a", "b", "b"]) + + df.iloc[:, 2] = ["str3"] + + expected = pd.DataFrame([[0, "str", "str3"]], columns=["a", "b", "b"]) + tm.assert_frame_equal(df, expected) + + def test_iloc_setitem_series_duplicate_columns(self): + df = pd.DataFrame( + np.arange(8, dtype=np.int64).reshape(2, 4), columns=["A", "B", "A", "B"] + ) + df.iloc[:, 0] = df.iloc[:, 0].astype(np.float64) + assert df.dtypes.iloc[2] == np.int64 diff --git a/pandas/tests/indexing/test_indexers.py b/pandas/tests/indexing/test_indexers.py index 173f33b19f8d5..744f9441e7376 100644 --- a/pandas/tests/indexing/test_indexers.py +++ b/pandas/tests/indexing/test_indexers.py @@ -1,7 +1,8 @@ # Tests aimed at pandas.core.indexers import numpy as np +import pytest -from pandas.core.indexers import length_of_indexer +from 
pandas.core.indexers import is_scalar_indexer, length_of_indexer, validate_indices def test_length_of_indexer(): @@ -9,3 +10,42 @@ def test_length_of_indexer(): arr[0] = 1 result = length_of_indexer(arr) assert result == 1 + + +def test_is_scalar_indexer(): + indexer = (0, 1) + assert is_scalar_indexer(indexer, 2) + assert not is_scalar_indexer(indexer[0], 2) + + indexer = (np.array([2]), 1) + assert is_scalar_indexer(indexer, 2) + + indexer = (np.array([2]), np.array([3])) + assert is_scalar_indexer(indexer, 2) + + indexer = (np.array([2]), np.array([3, 4])) + assert not is_scalar_indexer(indexer, 2) + + assert not is_scalar_indexer(slice(None), 1) + + +class TestValidateIndices: + def test_validate_indices_ok(self): + indices = np.asarray([0, 1]) + validate_indices(indices, 2) + validate_indices(indices[:0], 0) + validate_indices(np.array([-1, -1]), 0) + + def test_validate_indices_low(self): + indices = np.asarray([0, -2]) + with pytest.raises(ValueError, match="'indices' contains"): + validate_indices(indices, 2) + + def test_validate_indices_high(self): + indices = np.asarray([0, 1, 2]) + with pytest.raises(IndexError, match="indices are out"): + validate_indices(indices, 2) + + def test_validate_indices_empty(self): + with pytest.raises(IndexError, match="indices are out"): + validate_indices(np.array([0, 1]), 0) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 8af0fe548e48a..294e3e27c4df5 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -12,7 +12,6 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series import pandas._testing as tm -from pandas.core.indexers import validate_indices from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice from pandas.tests.indexing.common import _mklbl @@ -52,9 +51,6 @@ def test_setitem_ndarray_1d(self): with pytest.raises(ValueError): df[2:5] = np.arange(1, 4) * 1j - @pytest.mark.parametrize( - "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ - ) @pytest.mark.parametrize( "obj", [ @@ -71,9 +67,9 @@ def test_setitem_ndarray_1d(self): (lambda x: x.iloc, "iloc"), ], ) - def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): + def test_getitem_ndarray_3d(self, indices, obj, idxr, idxr_id): # GH 25567 - obj = obj(index) + obj = obj(indices) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) @@ -83,16 +79,16 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): "Cannot index with multidimensional key", r"Wrong number of dimensions. 
values.ndim != ndim \[3 != 1\]", "Index data must be 1-dimensional", + "positional indexers are out-of-bounds", + "Indexing a MultiIndex with a multidimensional key is not implemented", ] ) - with pytest.raises(ValueError, match=msg): + potential_errors = (IndexError, ValueError, NotImplementedError) + with pytest.raises(potential_errors, match=msg): with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): idxr[nd3] - @pytest.mark.parametrize( - "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ - ) @pytest.mark.parametrize( "obj", [ @@ -109,17 +105,25 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): (lambda x: x.iloc, "iloc"), ], ) - def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): + def test_setitem_ndarray_3d(self, indices, obj, idxr, idxr_id): # GH 25567 - obj = obj(index) + obj = obj(indices) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) + if ( + (len(indices) == 0) + and (idxr_id == "iloc") + and isinstance(obj, pd.DataFrame) + ): + # gh-32896 + pytest.skip("This is currently failing. There's an xfailed test below.") + if idxr_id == "iloc": err = ValueError msg = f"Cannot set values with ndim > {obj.ndim}" elif ( - isinstance(index, pd.IntervalIndex) + isinstance(indices, pd.IntervalIndex) and idxr_id == "setitem" and obj.ndim == 1 ): @@ -134,6 +138,17 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): with pytest.raises(err, match=msg): idxr[nd3] = 0 + @pytest.mark.xfail(reason="gh-32896") + def test_setitem_ndarray_3d_does_not_fail_for_iloc_empty_dataframe(self): + # when fixing this, please remove the pytest.skip in test_setitem_ndarray_3d + i = Index([]) + obj = DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i) + nd3 = np.random.randint(5, size=(2, 2, 2)) + + msg = f"Cannot set values with ndim > {obj.ndim}" + with pytest.raises(ValueError, match=msg): + obj.iloc[nd3] = 0 + def test_inf_upcast(self): # GH 16957 # We should be able to use np.inf as a key @@ -605,69 +620,6 @@ def test_astype_assignment(self): expected = DataFrame({"A": [1, 2, 3, 4]}) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - "index,val", - [ - (Index([0, 1, 2]), 2), - (Index([0, 1, "2"]), "2"), - (Index([0, 1, 2, np.inf, 4]), 4), - (Index([0, 1, 2, np.nan, 4]), 4), - (Index([0, 1, 2, np.inf]), np.inf), - (Index([0, 1, 2, np.nan]), np.nan), - ], - ) - def test_index_contains(self, index, val): - assert val in index - - @pytest.mark.parametrize( - "index,val", - [ - (Index([0, 1, 2]), "2"), - (Index([0, 1, "2"]), 2), - (Index([0, 1, 2, np.inf]), 4), - (Index([0, 1, 2, np.nan]), 4), - (Index([0, 1, 2, np.inf]), np.nan), - (Index([0, 1, 2, np.nan]), np.inf), - # Checking if np.inf in Int64Index should not cause an OverflowError - # Related to GH 16957 - (pd.Int64Index([0, 1, 2]), np.inf), - (pd.Int64Index([0, 1, 2]), np.nan), - (pd.UInt64Index([0, 1, 2]), np.inf), - (pd.UInt64Index([0, 1, 2]), np.nan), - ], - ) - def test_index_not_contains(self, index, val): - assert val not in index - - @pytest.mark.parametrize( - "index,val", [(Index([0, 1, "2"]), 0), (Index([0, 1, "2"]), "2")] - ) - def test_mixed_index_contains(self, index, val): - # GH 19860 - assert val in index - - @pytest.mark.parametrize( - "index,val", [(Index([0, 1, "2"]), "1"), (Index([0, 1, "2"]), 2)] - ) - def test_mixed_index_not_contains(self, index, val): - # GH 19860 - assert val not in index - - def test_contains_with_float_index(self): - # GH#22085 - integer_index = pd.Int64Index([0, 1, 2, 3]) - uinteger_index = 
pd.UInt64Index([0, 1, 2, 3]) - float_index = pd.Float64Index([0.1, 1.1, 2.2, 3.3]) - - for index in (integer_index, uinteger_index): - assert 1.1 not in index - assert 1.0 in index - assert 1 in index - - assert 1.1 in float_index - assert 1.0 not in float_index - assert 1 not in float_index - def test_index_type_coercion(self): # GH 11836 @@ -1035,30 +987,6 @@ def test_none_coercion_mixed_dtypes(self): tm.assert_frame_equal(start_dataframe, exp) -def test_validate_indices_ok(): - indices = np.asarray([0, 1]) - validate_indices(indices, 2) - validate_indices(indices[:0], 0) - validate_indices(np.array([-1, -1]), 0) - - -def test_validate_indices_low(): - indices = np.asarray([0, -2]) - with pytest.raises(ValueError, match="'indices' contains"): - validate_indices(indices, 2) - - -def test_validate_indices_high(): - indices = np.asarray([0, 1, 2]) - with pytest.raises(IndexError, match="indices are out"): - validate_indices(indices, 2) - - -def test_validate_indices_empty(): - with pytest.raises(IndexError, match="indices are out"): - validate_indices(np.array([0, 1]), 0) - - def test_extension_array_cross_section(): # A cross-section of a homogeneous EA should be an EA df = pd.DataFrame( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 276d11a67ad18..d1f67981b1ec5 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -35,7 +35,7 @@ def test_loc_getitem_label_out_of_range(self): "loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError, ) self.check_result("loc", 20, typs=["labels"], fails=KeyError) - self.check_result("loc", 20, typs=["ts"], axes=0, fails=TypeError) + self.check_result("loc", 20, typs=["ts"], axes=0, fails=KeyError) self.check_result("loc", 20, typs=["floats"], axes=0, fails=KeyError) def test_loc_getitem_label_list(self): @@ -631,6 +631,64 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): assert is_scalar(result) and result == "Z" + @pytest.mark.parametrize( + "index,box,expected", + [ + ( + ([0, 2], ["A", "B", "C", "D"]), + 7, + pd.DataFrame( + [[7, 7, 7, 7], [3, 4, np.nan, np.nan], [7, 7, 7, 7]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (1, ["C", "D"]), + [7, 8], + pd.DataFrame( + [[1, 2, np.nan, np.nan], [3, 4, 7, 8], [5, 6, np.nan, np.nan]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (1, ["A", "B", "C"]), + np.array([7, 8, 9], dtype=np.int64), + pd.DataFrame( + [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], + columns=["A", "B", "C"], + ), + ), + ( + (slice(1, 3, None), ["B", "C", "D"]), + [[7, 8, 9], [10, 11, 12]], + pd.DataFrame( + [[1, 2, np.nan, np.nan], [3, 7, 8, 9], [5, 10, 11, 12]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (slice(1, 3, None), ["C", "A", "D"]), + np.array([[7, 8, 9], [10, 11, 12]], dtype=np.int64), + pd.DataFrame( + [[1, 2, np.nan, np.nan], [8, 4, 7, 9], [11, 6, 10, 12]], + columns=["A", "B", "C", "D"], + ), + ), + ( + (slice(None, None, None), ["A", "C"]), + pd.DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]), + pd.DataFrame( + [[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"] + ), + ), + ], + ) + def test_loc_setitem_missing_columns(self, index, box, expected): + # GH 29334 + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"]) + df.loc[index] = box + tm.assert_frame_equal(df, expected) + def test_loc_coercion(self): # 12411 @@ -863,6 +921,7 @@ def test_loc_setitem_empty_append_raises(self): data = [1, 2] df = DataFrame(columns=["x", "y"]) + df.index = df.index.astype(np.int64) msg = ( r"None of 
\[Int64Index\(\[0, 1\], dtype='int64'\)\] " r"are in the \[index\]" @@ -975,3 +1034,75 @@ def test_loc_mixed_int_float(): result = ser.loc[1] assert result == 0 + + +def test_loc_with_positional_slice_deprecation(): + # GH#31840 + ser = pd.Series(range(4), index=["A", "B", "C", "D"]) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + ser.loc[:3] = 2 + + expected = pd.Series([2, 2, 2, 3], index=["A", "B", "C", "D"]) + tm.assert_series_equal(ser, expected) + + +def test_loc_slice_disallows_positional(): + # GH#16121, GH#24612, GH#31810 + dti = pd.date_range("2016-01-01", periods=3) + df = pd.DataFrame(np.random.random((3, 2)), index=dti) + + ser = df[0] + + msg = ( + "cannot do slice indexing on DatetimeIndex with these " + r"indexers \[1\] of type int" + ) + + for obj in [df, ser]: + with pytest.raises(TypeError, match=msg): + obj.loc[1:3] + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # GH#31840 deprecated incorrect behavior + obj.loc[1:3] = 1 + + with pytest.raises(TypeError, match=msg): + df.loc[1:3, 1] + + with tm.assert_produces_warning(FutureWarning): + # GH#31840 deprecated incorrect behavior + df.loc[1:3, 1] = 2 + + +def test_loc_datetimelike_mismatched_dtypes(): + # GH#32650 dont mix and match datetime/timedelta/period dtypes + + df = pd.DataFrame( + np.random.randn(5, 3), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="H", periods=5), + ) + # create dataframe with non-unique DatetimeIndex + df = df.iloc[[0, 2, 2, 3]].copy() + + dti = df.index + tdi = pd.TimedeltaIndex(dti.asi8) # matching i8 values + + msg = r"None of \[TimedeltaIndex.* are in the \[index\]" + with pytest.raises(KeyError, match=msg): + df.loc[tdi] + + with pytest.raises(KeyError, match=msg): + df["a"].loc[tdi] + + +def test_loc_with_period_index_indexer(): + # GH#4125 + idx = pd.period_range("2002-01", "2003-12", freq="M") + df = pd.DataFrame(np.random.randn(24, 10), index=idx) + tm.assert_frame_equal(df, df.loc[idx]) + tm.assert_frame_equal(df, df.loc[list(idx)]) + tm.assert_frame_equal(df, df.loc[list(idx)]) + tm.assert_frame_equal(df.iloc[0:5], df.loc[idx[0:5]]) + tm.assert_frame_equal(df, df.loc[list(idx)]) diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py index 345ca30ec77eb..9e8ef6e6e1c22 100644 --- a/pandas/tests/indexing/test_na_indexing.py +++ b/pandas/tests/indexing/test_na_indexing.py @@ -7,6 +7,7 @@ @pytest.mark.parametrize( "values, dtype", [ + ([], "object"), ([1, 2, 3], "int64"), ([1.0, 2.0, 3.0], "float64"), (["a", "b", "c"], "object"), @@ -22,42 +23,43 @@ @pytest.mark.parametrize( "mask", [[True, False, False], [True, True, True], [False, False, False]] ) -@pytest.mark.parametrize("box_mask", [True, False]) +@pytest.mark.parametrize("indexer_class", [list, pd.array, pd.Index, pd.Series]) @pytest.mark.parametrize("frame", [True, False]) -def test_series_mask_boolean(values, dtype, mask, box_mask, frame): - ser = pd.Series(values, dtype=dtype, index=["a", "b", "c"]) - if frame: - ser = ser.to_frame() - mask = pd.array(mask, dtype="boolean") - if box_mask: - mask = pd.Series(mask, index=ser.index) - - expected = ser[mask.astype("bool")] +def test_series_mask_boolean(values, dtype, mask, indexer_class, frame): + # In case len(values) < 3 + index = ["a", "b", "c"][: len(values)] + mask = mask[: len(values)] - result = ser[mask] - tm.assert_equal(result, expected) - - if not box_mask: - # Series.iloc[Series[bool]] isn't allowed - result = ser.iloc[mask] - tm.assert_equal(result, 
expected) + obj = pd.Series(values, dtype=dtype, index=index) + if frame: + if len(values) == 0: + # Otherwise obj is an empty DataFrame with shape (0, 1) + obj = pd.DataFrame(dtype=dtype) + else: + obj = obj.to_frame() + + if indexer_class is pd.array: + mask = pd.array(mask, dtype="boolean") + elif indexer_class is pd.Series: + mask = pd.Series(mask, index=obj.index, dtype="boolean") + else: + mask = indexer_class(mask) - result = ser.loc[mask] - tm.assert_equal(result, expected) + expected = obj[mask] - # empty - mask = mask[:0] - ser = ser.iloc[:0] - expected = ser[mask.astype("bool")] - result = ser[mask] + result = obj[mask] tm.assert_equal(result, expected) - if not box_mask: - # Series.iloc[Series[bool]] isn't allowed - result = ser.iloc[mask] + if indexer_class is pd.Series: + msg = "iLocation based boolean indexing cannot use an indexable as a mask" + with pytest.raises(ValueError, match=msg): + result = obj.iloc[mask] + tm.assert_equal(result, expected) + else: + result = obj.iloc[mask] tm.assert_equal(result, expected) - result = ser.loc[mask] + result = obj.loc[mask] tm.assert_equal(result, expected) diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 25939e63c256b..61d109344568c 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas import DataFrame, Series, Timedelta, Timestamp, date_range +from pandas import DataFrame, Series, Timedelta, Timestamp, date_range, period_range import pandas._testing as tm from pandas.tests.indexing.common import Base @@ -302,3 +302,12 @@ def test_iat_dont_wrap_object_datetimelike(): assert result is ser2[1] assert isinstance(result, timedelta) assert not isinstance(result, Timedelta) + + +def test_iat_series_with_period_index(): + # GH 4390, iat incorrectly indexing + index = period_range("1/1/2001", periods=10) + ser = Series(np.random.randn(10), index=index) + expected = ser[index[0]] + result = ser.iat[0] + assert expected == result diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index dd4750123c0b5..7da368e4bb321 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -6,7 +6,7 @@ class TestTimedeltaIndexing: - def test_boolean_indexing(self): + def test_loc_setitem_bool_mask(self): # GH 14946 df = pd.DataFrame({"x": range(10)}) df.index = pd.to_timedelta(range(10), unit="s") @@ -17,7 +17,9 @@ def test_boolean_indexing(self): [10, 10, 10, 3, 4, 5, 6, 7, 8, 9], ] for cond, data in zip(conditions, expected_data): - result = df.assign(x=df.mask(cond, 10).astype("int64")) + result = df.copy() + result.loc[cond, "x"] = 10 + expected = pd.DataFrame( data, index=pd.to_timedelta(range(10), unit="s"), @@ -58,7 +60,7 @@ def test_string_indexing(self): tm.assert_series_equal(sliced, expected) @pytest.mark.parametrize("value", [None, pd.NaT, np.nan]) - def test_masked_setitem(self, value): + def test_setitem_mask_na_value_td64(self, value): # issue (#18586) series = pd.Series([0, 1, 2], dtype="timedelta64[ns]") series[series == series[0]] = value diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 27b0500983afd..58a84f5a267bc 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -135,7 +135,7 @@ def create_single_mgr(typestr, num_rows=None): return SingleBlockManager( create_block(typestr, placement=slice(0, num_rows), 
item_shape=()), - np.arange(num_rows), + Index(np.arange(num_rows)), ) @@ -207,7 +207,6 @@ def setup_method(self, method): self.cblock = create_block("complex", [7]) self.oblock = create_block("object", [1, 3]) self.bool_block = create_block("bool", [5]) - self.int_block = create_block("int", [6]) def test_constructor(self): int32block = create_block("i4", [0]) @@ -233,21 +232,6 @@ def test_attrs(self): assert self.fblock.dtype == self.fblock.values.dtype assert len(self.fblock) == len(self.fblock.values) - def test_merge(self): - avals = tm.randn(2, 10) - bvals = tm.randn(2, 10) - - ref_cols = Index(["e", "a", "b", "d", "f"]) - - ablock = make_block(avals, ref_cols.get_indexer(["e", "b"])) - bblock = make_block(bvals, ref_cols.get_indexer(["a", "d"])) - merged = ablock.merge(bblock) - tm.assert_numpy_array_equal( - merged.mgr_locs.as_array, np.array([0, 1, 2, 3], dtype=np.int64) - ) - tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals)) - tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals)) - def test_copy(self): cop = self.fblock.copy() assert cop is not self.fblock @@ -309,17 +293,14 @@ def test_duplicate_ref_loc_failure(self): msg = "Gaps in blk ref_locs" with pytest.raises(AssertionError, match=msg): - BlockManager(blocks, axes) + mgr = BlockManager(blocks, axes) + mgr._rebuild_blknos_and_blklocs() blocks[0].mgr_locs = np.array([0]) blocks[1].mgr_locs = np.array([1]) mgr = BlockManager(blocks, axes) mgr.iget(1) - def test_contains(self, mgr): - assert "a" in mgr - assert "baz" not in mgr - def test_pickle(self, mgr): mgr2 = tm.round_trip_pickle(mgr) @@ -333,13 +314,9 @@ def test_pickle(self, mgr): assert not mgr2._is_consolidated assert not mgr2._known_consolidated - def test_non_unique_pickle(self): - - mgr = create_mgr("a,a,a:f8") - mgr2 = tm.round_trip_pickle(mgr) - tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) - - mgr = create_mgr("a: f8; a: i8") + @pytest.mark.parametrize("mgr_string", ["a,a,a:f8", "a: f8; a: i8"]) + def test_non_unique_pickle(self, mgr_string): + mgr = create_mgr(mgr_string) mgr2 = tm.round_trip_pickle(mgr) tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) @@ -427,22 +404,25 @@ def test_sparse_mixed(self): # TODO: what to test here? 
- def test_as_array_float(self): - mgr = create_mgr("c: f4; d: f2; e: f8") - assert mgr.as_array().dtype == np.float64 - - mgr = create_mgr("c: f4; d: f2") - assert mgr.as_array().dtype == np.float32 - - def test_as_array_int_bool(self): - mgr = create_mgr("a: bool-1; b: bool-2") - assert mgr.as_array().dtype == np.bool_ - - mgr = create_mgr("a: i8-1; b: i8-2; c: i4; d: i2; e: u1") - assert mgr.as_array().dtype == np.int64 + @pytest.mark.parametrize( + "mgr_string, dtype", + [("c: f4; d: f2", np.float32), ("c: f4; d: f2; e: f8", np.float64)], + ) + def test_as_array_float(self, mgr_string, dtype): + mgr = create_mgr(mgr_string) + assert mgr.as_array().dtype == dtype - mgr = create_mgr("c: i4; d: i2; e: u1") - assert mgr.as_array().dtype == np.int32 + @pytest.mark.parametrize( + "mgr_string, dtype", + [ + ("a: bool-1; b: bool-2", np.bool_), + ("a: i8-1; b: i8-2; c: i4; d: i2; e: u1", np.int64), + ("c: i4; d: i2; e: u1", np.int32), + ], + ) + def test_as_array_int_bool(self, mgr_string, dtype): + mgr = create_mgr(mgr_string) + assert mgr.as_array().dtype == dtype def test_as_array_datetime(self): mgr = create_mgr("h: datetime-1; g: datetime-2") @@ -540,8 +520,14 @@ def _compare(old_mgr, new_mgr): assert new_mgr.get("g").dtype == np.float64 assert new_mgr.get("h").dtype == np.float16 - def test_interleave(self): + def test_invalid_ea_block(self): + with pytest.raises(AssertionError, match="block.size != values.size"): + create_mgr("a: category; b: category") + with pytest.raises(AssertionError, match="block.size != values.size"): + create_mgr("a: category2; b: category2") + + def test_interleave(self): # self for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]: mgr = create_mgr(f"a: {dtype}") @@ -549,17 +535,37 @@ def test_interleave(self): mgr = create_mgr(f"a: {dtype}; b: {dtype}") assert mgr.as_array().dtype == dtype + @pytest.mark.parametrize( + "mgr_string, dtype", + [ + ("a: category", "i8"), + ("a: category; b: category", "i8"), + ("a: category; b: category2", "object"), + ("a: category2", "object"), + ("a: category2; b: category2", "object"), + ("a: f8", "f8"), + ("a: f8; b: i8", "f8"), + ("a: f4; b: i8", "f8"), + ("a: f4; b: i8; d: object", "object"), + ("a: bool; b: i8", "object"), + ("a: complex", "complex"), + ("a: f8; b: category", "object"), + ("a: M8[ns]; b: category", "object"), + ("a: M8[ns]; b: bool", "object"), + ("a: M8[ns]; b: i8", "object"), + ("a: m8[ns]; b: bool", "object"), + ("a: m8[ns]; b: i8", "object"), + ("a: M8[ns]; b: m8[ns]", "object"), + ], + ) + def test_interleave_dtype(self, mgr_string, dtype): # will be converted according the actual dtype of the underlying mgr = create_mgr("a: category") assert mgr.as_array().dtype == "i8" - mgr = create_mgr("a: category; b: category") - assert mgr.as_array().dtype == "i8" mgr = create_mgr("a: category; b: category2") assert mgr.as_array().dtype == "object" mgr = create_mgr("a: category2") assert mgr.as_array().dtype == "object" - mgr = create_mgr("a: category2; b: category2") - assert mgr.as_array().dtype == "object" # combinations mgr = create_mgr("a: f8") @@ -686,13 +692,12 @@ def test_get_bool_data(self): def test_unicode_repr_doesnt_raise(self): repr(create_mgr("b,\u05d0: object")) - def test_equals(self): + @pytest.mark.parametrize( + "mgr_string", ["a,b,c: i8-1; d,e,f: i8-2", "a,a,a: i8-1; b,b,b: i8-2"] + ) + def test_equals(self, mgr_string): # unique items - bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") - bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) - assert bm1.equals(bm2) - - bm1 = 
create_mgr("a,a,a: i8-1; b,b,b: i8-2") + bm1 = create_mgr(mgr_string) bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) assert bm1.equals(bm2) @@ -702,7 +707,7 @@ def test_equals(self): "a:i8;b:f8", # basic case "a:i8;b:f8;c:c8;d:b", # many types "a:i8;e:dt;f:td;g:string", # more types - "a:i8;b:category;c:category2;d:category2", # categories + "a:i8;b:category;c:category2", # categories "c:sparse;d:sparse_na;b:f8", # sparse ], ) @@ -748,11 +753,6 @@ class TestIndexing: create_mgr("a,b,c,d,e,f: i8", item_shape=(N,)), create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N,)), create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N,)), - # 3-dim - create_mgr("a,b,c,d,e,f: f8", item_shape=(N, N)), - create_mgr("a,b,c,d,e,f: i8", item_shape=(N, N)), - create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N, N)), - create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N, N)), ] @pytest.mark.parametrize("mgr", MANAGERS) @@ -775,6 +775,7 @@ def assert_slice_ok(mgr, axis, slobj): ) tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) + assert mgr.ndim <= 2, mgr.ndim for ax in range(mgr.ndim): # slice assert_slice_ok(mgr, ax, slice(None)) @@ -906,97 +907,111 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): class TestBlockPlacement: - def test_slice_len(self): - assert len(BlockPlacement(slice(0, 4))) == 4 - assert len(BlockPlacement(slice(0, 4, 2))) == 2 - assert len(BlockPlacement(slice(0, 3, 2))) == 2 - - assert len(BlockPlacement(slice(0, 1, 2))) == 1 - assert len(BlockPlacement(slice(1, 0, -1))) == 1 + @pytest.mark.parametrize( + "slc, expected", + [ + (slice(0, 4), 4), + (slice(0, 4, 2), 2), + (slice(0, 3, 2), 2), + (slice(0, 1, 2), 1), + (slice(1, 0, -1), 1), + ], + ) + def test_slice_len(self, slc, expected): + assert len(BlockPlacement(slc)) == expected - def test_zero_step_raises(self): + @pytest.mark.parametrize("slc", [slice(1, 1, 0), slice(1, 2, 0)]) + def test_zero_step_raises(self, slc): msg = "slice step cannot be zero" - with pytest.raises(ValueError, match=msg): - BlockPlacement(slice(1, 1, 0)) + BlockPlacement(slc) + + @pytest.mark.parametrize( + "slc", + [ + slice(None, None), + slice(10, None), + slice(None, None, -1), + slice(None, 10, -1), + # These are "unbounded" because negative index will + # change depending on container shape. + slice(-1, None), + slice(None, -1), + slice(-1, -1), + slice(-1, None, -1), + slice(None, -1, -1), + slice(-1, -1, -1), + ], + ) + def test_unbounded_slice_raises(self, slc): + msg = "unbounded slice" with pytest.raises(ValueError, match=msg): - BlockPlacement(slice(1, 2, 0)) - - def test_unbounded_slice_raises(self): - def assert_unbounded_slice_error(slc): - with pytest.raises(ValueError, match="unbounded slice"): - BlockPlacement(slc) - - assert_unbounded_slice_error(slice(None, None)) - assert_unbounded_slice_error(slice(10, None)) - assert_unbounded_slice_error(slice(None, None, -1)) - assert_unbounded_slice_error(slice(None, 10, -1)) - - # These are "unbounded" because negative index will change depending on - # container shape. 
- assert_unbounded_slice_error(slice(-1, None)) - assert_unbounded_slice_error(slice(None, -1)) - assert_unbounded_slice_error(slice(-1, -1)) - assert_unbounded_slice_error(slice(-1, None, -1)) - assert_unbounded_slice_error(slice(None, -1, -1)) - assert_unbounded_slice_error(slice(-1, -1, -1)) - - def test_not_slice_like_slices(self): - def assert_not_slice_like(slc): - assert not BlockPlacement(slc).is_slice_like - - assert_not_slice_like(slice(0, 0)) - assert_not_slice_like(slice(100, 0)) - - assert_not_slice_like(slice(100, 100, -1)) - assert_not_slice_like(slice(0, 100, -1)) - - assert not BlockPlacement(slice(0, 0)).is_slice_like - assert not BlockPlacement(slice(100, 100)).is_slice_like - - def test_array_to_slice_conversion(self): - def assert_as_slice_equals(arr, slc): - assert BlockPlacement(arr).as_slice == slc - - assert_as_slice_equals([0], slice(0, 1, 1)) - assert_as_slice_equals([100], slice(100, 101, 1)) - - assert_as_slice_equals([0, 1, 2], slice(0, 3, 1)) - assert_as_slice_equals([0, 5, 10], slice(0, 15, 5)) - assert_as_slice_equals([0, 100], slice(0, 200, 100)) - - assert_as_slice_equals([2, 1], slice(2, 0, -1)) - - def test_not_slice_like_arrays(self): - def assert_not_slice_like(arr): - assert not BlockPlacement(arr).is_slice_like - - assert_not_slice_like([]) - assert_not_slice_like([-1]) - assert_not_slice_like([-1, -2, -3]) - assert_not_slice_like([-10]) - assert_not_slice_like([-1]) - assert_not_slice_like([-1, 0, 1, 2]) - assert_not_slice_like([-2, 0, 2, 4]) - assert_not_slice_like([1, 0, -1]) - assert_not_slice_like([1, 1, 1]) - - def test_slice_iter(self): - assert list(BlockPlacement(slice(0, 3))) == [0, 1, 2] - assert list(BlockPlacement(slice(0, 0))) == [] - assert list(BlockPlacement(slice(3, 0))) == [] - - def test_slice_to_array_conversion(self): - def assert_as_array_equals(slc, asarray): - tm.assert_numpy_array_equal( - BlockPlacement(slc).as_array, np.asarray(asarray, dtype=np.int64) - ) + BlockPlacement(slc) + + @pytest.mark.parametrize( + "slc", + [ + slice(0, 0), + slice(100, 0), + slice(100, 100), + slice(100, 100, -1), + slice(0, 100, -1), + ], + ) + def test_not_slice_like_slices(self, slc): + assert not BlockPlacement(slc).is_slice_like + + @pytest.mark.parametrize( + "arr, slc", + [ + ([0], slice(0, 1, 1)), + ([100], slice(100, 101, 1)), + ([0, 1, 2], slice(0, 3, 1)), + ([0, 5, 10], slice(0, 15, 5)), + ([0, 100], slice(0, 200, 100)), + ([2, 1], slice(2, 0, -1)), + ], + ) + def test_array_to_slice_conversion(self, arr, slc): + assert BlockPlacement(arr).as_slice == slc - assert_as_array_equals(slice(0, 3), [0, 1, 2]) - assert_as_array_equals(slice(0, 0), []) - assert_as_array_equals(slice(3, 0), []) + @pytest.mark.parametrize( + "arr", + [ + [], + [-1], + [-1, -2, -3], + [-10], + [-1], + [-1, 0, 1, 2], + [-2, 0, 2, 4], + [1, 0, -1], + [1, 1, 1], + ], + ) + def test_not_slice_like_arrays(self, arr): + assert not BlockPlacement(arr).is_slice_like + + @pytest.mark.parametrize( + "slc, expected", + [(slice(0, 3), [0, 1, 2]), (slice(0, 0), []), (slice(3, 0), [])], + ) + def test_slice_iter(self, slc, expected): + assert list(BlockPlacement(slc)) == expected - assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) + @pytest.mark.parametrize( + "slc, arr", + [ + (slice(0, 3), [0, 1, 2]), + (slice(0, 0), []), + (slice(3, 0), []), + (slice(3, 0, -1), [3, 2, 1]), + ], + ) + def test_slice_to_array_conversion(self, slc, arr): + tm.assert_numpy_array_equal( + BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.int64) + ) def test_blockplacement_add(self): bpl 
= BlockPlacement(slice(0, 5)) @@ -1004,30 +1019,30 @@ def test_blockplacement_add(self): assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2) assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5] - def test_blockplacement_add_int(self): - def assert_add_equals(val, inc, result): - assert list(BlockPlacement(val).add(inc)) == result - - assert_add_equals(slice(0, 0), 0, []) - assert_add_equals(slice(1, 4), 0, [1, 2, 3]) - assert_add_equals(slice(3, 0, -1), 0, [3, 2, 1]) - assert_add_equals([1, 2, 4], 0, [1, 2, 4]) - - assert_add_equals(slice(0, 0), 10, []) - assert_add_equals(slice(1, 4), 10, [11, 12, 13]) - assert_add_equals(slice(3, 0, -1), 10, [13, 12, 11]) - assert_add_equals([1, 2, 4], 10, [11, 12, 14]) - - assert_add_equals(slice(0, 0), -1, []) - assert_add_equals(slice(1, 4), -1, [0, 1, 2]) - assert_add_equals([1, 2, 4], -1, [0, 1, 3]) + @pytest.mark.parametrize( + "val, inc, expected", + [ + (slice(0, 0), 0, []), + (slice(1, 4), 0, [1, 2, 3]), + (slice(3, 0, -1), 0, [3, 2, 1]), + ([1, 2, 4], 0, [1, 2, 4]), + (slice(0, 0), 10, []), + (slice(1, 4), 10, [11, 12, 13]), + (slice(3, 0, -1), 10, [13, 12, 11]), + ([1, 2, 4], 10, [11, 12, 14]), + (slice(0, 0), -1, []), + (slice(1, 4), -1, [0, 1, 2]), + ([1, 2, 4], -1, [0, 1, 3]), + ], + ) + def test_blockplacement_add_int(self, val, inc, expected): + assert list(BlockPlacement(val).add(inc)) == expected + @pytest.mark.parametrize("val", [slice(1, 4), [1, 2, 4]]) + def test_blockplacement_add_int_raises(self, val): msg = "iadd causes length change" - with pytest.raises(ValueError, match=msg): - BlockPlacement(slice(1, 4)).add(-10) - with pytest.raises(ValueError, match=msg): - BlockPlacement([1, 2, 4]).add(-10) + BlockPlacement(val).add(-10) class DummyElement: @@ -1155,6 +1170,23 @@ def test_binop_other(self, op, value, dtype): tm.assert_series_equal(result, expected) +class TestShouldStore: + def test_should_store_categorical(self): + cat = pd.Categorical(["A", "B", "C"]) + df = pd.DataFrame(cat) + blk = df._mgr.blocks[0] + + # matching dtype + assert blk.should_store(cat) + assert blk.should_store(cat[:-1]) + + # different dtype + assert not blk.should_store(cat.as_ordered()) + + # ndarray instead of Categorical + assert not blk.should_store(np.asarray(cat)) + + @pytest.mark.parametrize( "typestr, holder", [ @@ -1184,7 +1216,7 @@ def test_block_shape(): a = pd.Series([1, 2, 3]).reindex(idx) b = pd.Series(pd.Categorical([1, 2, 3])).reindex(idx) - assert a._data.blocks[0].mgr_locs.indexer == b._data.blocks[0].mgr_locs.indexer + assert a._mgr.blocks[0].mgr_locs.indexer == b._mgr.blocks[0].mgr_locs.indexer def test_make_block_no_pandas_array(): @@ -1246,3 +1278,11 @@ def test_interleave_non_unique_cols(): assert df_unique.values.shape == df.values.shape tm.assert_numpy_array_equal(df_unique.values[0], df.values[0]) tm.assert_numpy_array_equal(df_unique.values[1], df.values[1]) + + +def test_single_block_manager_fastpath_deprecated(): + # GH#33092 + ser = pd.Series(range(3)) + blk = ser._data.blocks[0] + with tm.assert_produces_warning(FutureWarning): + SingleBlockManager(blk, ser.index, fastpath=True) diff --git a/pandas/tests/io/data/excel/high_surrogate.xlsx b/pandas/tests/io/data/excel/high_surrogate.xlsx new file mode 100644 index 0000000000000..1e29b6bee6586 Binary files /dev/null and b/pandas/tests/io/data/excel/high_surrogate.xlsx differ diff --git a/pandas/tests/io/data/excel/test_spaces.ods b/pandas/tests/io/data/excel/test_spaces.ods new file mode 100644 index 0000000000000..375e839c8c221 Binary files /dev/null and 
b/pandas/tests/io/data/excel/test_spaces.ods differ diff --git a/pandas/tests/io/data/excel/test_spaces.xls b/pandas/tests/io/data/excel/test_spaces.xls new file mode 100644 index 0000000000000..316db172360d0 Binary files /dev/null and b/pandas/tests/io/data/excel/test_spaces.xls differ diff --git a/pandas/tests/io/data/excel/test_spaces.xlsb b/pandas/tests/io/data/excel/test_spaces.xlsb new file mode 100644 index 0000000000000..e38b6c2d8f170 Binary files /dev/null and b/pandas/tests/io/data/excel/test_spaces.xlsb differ diff --git a/pandas/tests/io/data/excel/test_spaces.xlsm b/pandas/tests/io/data/excel/test_spaces.xlsm new file mode 100644 index 0000000000000..a41ebe5bb0e65 Binary files /dev/null and b/pandas/tests/io/data/excel/test_spaces.xlsm differ diff --git a/pandas/tests/io/data/excel/test_spaces.xlsx b/pandas/tests/io/data/excel/test_spaces.xlsx new file mode 100644 index 0000000000000..9071543c4739b Binary files /dev/null and b/pandas/tests/io/data/excel/test_spaces.xlsx differ diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 10ed192062d9c..60c943d95e510 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -114,7 +114,7 @@ def test_to_excel_with_openpyxl_engine(ext, tmpdir): df2 = DataFrame({"B": np.linspace(1, 20, 10)}) df = pd.concat([df1, df2], axis=1) styled = df.style.applymap( - lambda val: "color: %s" % "red" if val < 0 else "black" + lambda val: "color: %s" % ("red" if val < 0 else "black") ).highlight_max() filename = tmpdir / "styled.xlsx" diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a59b409809eed..99447c03e89af 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -426,7 +426,8 @@ def test_reader_dtype(self, read_ext): expected["c"] = ["001", "002", "003", "004"] tm.assert_frame_equal(actual, expected) - with pytest.raises(ValueError): + msg = "Unable to convert column d to type int64" + with pytest.raises(ValueError, match=msg): pd.read_excel(basename + read_ext, dtype={"d": "int64"}) @pytest.mark.parametrize( @@ -463,6 +464,24 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) + def test_reader_spaces(self, read_ext): + # see gh-32207 + basename = "test_spaces" + + actual = pd.read_excel(basename + read_ext) + expected = DataFrame( + { + "testcol": [ + "this is great", + "4 spaces", + "1 trailing ", + " 1 leading", + "2 spaces multiple times", + ] + } + ) + tm.assert_frame_equal(actual, expected) + def test_reading_all_sheets(self, read_ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. 
@@ -629,6 +648,17 @@ def test_read_from_py_localpath(self, read_ext): tm.assert_frame_equal(expected, actual) + @td.check_file_leaks + def test_close_from_py_localpath(self, read_ext): + + # GH31467 + str_path = os.path.join("test1" + read_ext) + with open(str_path, "rb") as f: + x = pd.read_excel(f, "Sheet1", index_col=0) + del x + # should not throw an exception because the passed file was closed + f.read() + def test_reader_seconds(self, read_ext): if pd.read_excel.keywords["engine"] == "pyxlsb": pytest.xfail("Sheets containing datetimes not supported by pyxlsb") @@ -811,13 +841,15 @@ def test_excel_old_index_format(self, read_ext): def test_read_excel_bool_header_arg(self, read_ext): # GH 6114 + msg = "Passing a bool to header is invalid" for arg in [True, False]: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): pd.read_excel("test1" + read_ext, header=arg) def test_read_excel_chunksize(self, read_ext): # GH 8011 - with pytest.raises(NotImplementedError): + msg = "chunksize keyword of read_excel is not implemented" + with pytest.raises(NotImplementedError, match=msg): pd.read_excel("test1" + read_ext, chunksize=100) def test_read_excel_skiprows_list(self, read_ext): @@ -1020,10 +1052,10 @@ def test_excel_read_buffer(self, engine, read_ext): tm.assert_frame_equal(expected, actual) def test_reader_closes_file(self, engine, read_ext): - f = open("test1" + read_ext, "rb") - with pd.ExcelFile(f) as xlsx: - # parses okay - pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine) + with open("test1" + read_ext, "rb") as f: + with pd.ExcelFile(f) as xlsx: + # parses okay + pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine) assert f.closed @@ -1044,3 +1076,11 @@ def test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + + def test_excel_high_surrogate(self, engine): + # GH 23809 + expected = pd.DataFrame(["\udc88"], columns=["Column1"]) + + # should not produce a segmentation violation + actual = pd.read_excel("high_surrogate.xlsx") + tm.assert_frame_equal(expected, actual) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 506d223dbedb4..0811f2f822198 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -330,7 +330,8 @@ def test_excel_sheet_by_name_raise(self, path): tm.assert_frame_equal(gt, df) - with pytest.raises(xlrd.XLRDError): + msg = "No sheet named <'0'>" + with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, "0") def test_excel_writer_context_manager(self, frame, path): @@ -452,7 +453,7 @@ def test_float_types(self, np_type, path): reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) - tm.assert_frame_equal(df, recons, check_dtype=False) + tm.assert_frame_equal(df, recons) @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, np_type, path): @@ -564,7 +565,7 @@ def test_roundtrip_indexlabels(self, merge_cells, frame, path): reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=[0, 1]) - tm.assert_frame_equal(df, recons, check_less_precise=True) + tm.assert_frame_equal(df, recons) def test_excel_roundtrip_indexname(self, merge_cells, path): df = DataFrame(np.random.randn(10, 4)) @@ -973,7 +974,11 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): # This if will be removed once multi-column Excel writing # is implemented. For now fixing gh-9794. 
if c_idx_nlevels > 1: - with pytest.raises(NotImplementedError): + msg = ( + "Writing to Excel with MultiIndex columns and no index " + "\\('index'=False\\) is not yet implemented." + ) + with pytest.raises(NotImplementedError, match=msg): roundtrip(df, use_headers, index=False) else: res = roundtrip(df, use_headers) diff --git a/pandas/tests/io/excel/test_xlwt.py b/pandas/tests/io/excel/test_xlwt.py index 01feab08eb5e3..a2d8b9fce9767 100644 --- a/pandas/tests/io/excel/test_xlwt.py +++ b/pandas/tests/io/excel/test_xlwt.py @@ -18,7 +18,12 @@ def test_excel_raise_error_on_multiindex_columns_and_no_index(ext): [("site", ""), ("2014", "height"), ("2014", "weight")] ) df = DataFrame(np.random.randn(10, 3), columns=cols) - with pytest.raises(NotImplementedError): + + msg = ( + "Writing to Excel with MultiIndex columns and no index " + "\\('index'=False\\) is not yet implemented." + ) + with pytest.raises(NotImplementedError, match=msg): with tm.ensure_clean(ext) as path: df.to_excel(path, index=False) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index bf7b98eb78f11..f3c3344992942 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1508,7 +1508,8 @@ def test_to_string_specified_header(self): assert df_s == expected - with pytest.raises(ValueError): + msg = "Writing 2 cols but got 1 aliases" + with pytest.raises(ValueError, match=msg): df.to_string(header=["X"]) def test_to_string_no_index(self): @@ -3002,13 +3003,13 @@ def test_days_neg(self): def test_subdays(self): y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") result = fmt.Timedelta64Formatter(y, box=True).get_result() - assert result[0].strip() == "'00:00:00'" - assert result[1].strip() == "'00:00:01'" + assert result[0].strip() == "'0 days 00:00:00'" + assert result[1].strip() == "'0 days 00:00:01'" def test_subdays_neg(self): y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") result = fmt.Timedelta64Formatter(-y, box=True).get_result() - assert result[0].strip() == "'00:00:00'" + assert result[0].strip() == "'0 days 00:00:00'" assert result[1].strip() == "'-1 days +23:59:59'" def test_zero(self): diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index a2659079be7c0..ec4614538004c 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -37,7 +37,8 @@ def h(x, foo="bar"): ] def test_init_non_pandas(self): - with pytest.raises(TypeError): + msg = "``data`` must be a Series or DataFrame" + with pytest.raises(TypeError, match=msg): Styler([1, 2, 3]) def test_init_series(self): @@ -1013,7 +1014,8 @@ def test_bar_align_zero_nans(self): def test_bar_bad_align_raises(self): df = pd.DataFrame({"A": [-100, -60, -30, -20]}) - with pytest.raises(ValueError): + msg = "`align` must be one of {'left', 'zero',' mid'}" + with pytest.raises(ValueError, match=msg): df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) def test_format_with_na_rep(self): @@ -1082,7 +1084,8 @@ def test_format_non_numeric_na(self): def test_format_with_bad_na_rep(self): # GH 21527 28358 df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) - with pytest.raises(TypeError): + msg = "Expected a string, got -1 instead" + with pytest.raises(TypeError, match=msg): df.style.format(None, na_rep=-1) def test_highlight_null(self, null_color="red"): @@ -1091,12 +1094,30 @@ def test_highlight_null(self, null_color="red"): expected = {(0, 0): [""], (1, 0): ["background-color: red"]} 
assert result == expected + def test_highlight_null_subset(self): + # GH 31345 + df = pd.DataFrame({"A": [0, np.nan], "B": [0, np.nan]}) + result = ( + df.style.highlight_null(null_color="red", subset=["A"]) + .highlight_null(null_color="green", subset=["B"]) + ._compute() + .ctx + ) + expected = { + (0, 0): [""], + (1, 0): ["background-color: red"], + (0, 1): [""], + (1, 1): ["background-color: green"], + } + assert result == expected + def test_nonunique_raises(self): df = pd.DataFrame([[1, 2]], columns=["A", "A"]) - with pytest.raises(ValueError): + msg = "style is not supported for non-unique indices." + with pytest.raises(ValueError, match=msg): df.style - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Styler(df) def test_caption(self): @@ -1243,9 +1264,12 @@ def test_display_format(self): def test_display_format_raises(self): df = pd.DataFrame(np.random.randn(2, 2)) - with pytest.raises(TypeError): + msg = "Expected a template string or callable, got 5 instead" + with pytest.raises(TypeError, match=msg): df.style.format(5) - with pytest.raises(TypeError): + + msg = "Expected a template string or callable, got True instead" + with pytest.raises(TypeError, match=msg): df.style.format(True) def test_display_set_precision(self): @@ -1318,19 +1342,21 @@ def test_display_dict(self): def test_bad_apply_shape(self): df = pd.DataFrame([[1, 2], [3, 4]]) - with pytest.raises(ValueError): + msg = "returned the wrong shape" + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: "x", subset=pd.IndexSlice[[0, 1], :]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: [""], subset=pd.IndexSlice[[0, 1], :]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: ["", "", "", ""]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: ["", "", ""], subset=1) - with pytest.raises(ValueError): + msg = "Length mismatch: Expected axis has 3 elements" + with pytest.raises(ValueError, match=msg): df.style._apply(lambda x: ["", "", ""], axis=1) def test_apply_bad_return(self): @@ -1338,7 +1364,8 @@ def f(x): return "" df = pd.DataFrame([[1, 2], [3, 4]]) - with pytest.raises(TypeError): + msg = "must return a DataFrame when passed to `Styler.apply` with axis=None" + with pytest.raises(TypeError, match=msg): df.style._apply(f, axis=None) def test_apply_bad_labels(self): @@ -1346,7 +1373,8 @@ def f(x): return pd.DataFrame(index=[1, 2], columns=["a", "b"]) df = pd.DataFrame([[1, 2], [3, 4]]) - with pytest.raises(ValueError): + msg = "must have identical index and columns as the input" + with pytest.raises(ValueError, match=msg): df.style._apply(f, axis=None) def test_get_level_lengths(self): diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index c2fbc59b8f482..509e5bcb33304 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -664,7 +664,8 @@ def test_to_latex_specified_header(self): assert withoutescape_result == withoutescape_expected - with pytest.raises(ValueError): + msg = "Writing 2 cols but got 1 aliases" + with pytest.raises(ValueError, match=msg): df.to_latex(header=["A"]) def test_to_latex_decimal(self, float_frame): diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py old mode 100755 new mode 100644 index ca853ba5f00f5..e64103bd2cde8 --- 
a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - """ self-contained to write legacy storage pickle files diff --git a/pandas/tests/io/json/test_deprecated_kwargs.py b/pandas/tests/io/json/test_deprecated_kwargs.py new file mode 100644 index 0000000000000..79245bc9d34a8 --- /dev/null +++ b/pandas/tests/io/json/test_deprecated_kwargs.py @@ -0,0 +1,31 @@ +""" +Tests for the deprecated keyword arguments for `read_json`. +""" + +import pandas as pd +import pandas._testing as tm + +from pandas.io.json import read_json + + +def test_deprecated_kwargs(): + df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2]) + buf = df.to_json(orient="split") + with tm.assert_produces_warning(FutureWarning): + tm.assert_frame_equal(df, read_json(buf, "split")) + buf = df.to_json(orient="columns") + with tm.assert_produces_warning(FutureWarning): + tm.assert_frame_equal(df, read_json(buf, "columns")) + buf = df.to_json(orient="index") + with tm.assert_produces_warning(FutureWarning): + tm.assert_frame_equal(df, read_json(buf, "index")) + + +def test_good_kwargs(): + df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2]) + with tm.assert_produces_warning(None): + tm.assert_frame_equal(df, read_json(df.to_json(orient="split"), orient="split")) + tm.assert_frame_equal( + df, read_json(df.to_json(orient="columns"), orient="columns") + ) + tm.assert_frame_equal(df, read_json(df.to_json(orient="index"), orient="index")) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 2ac2acc6748d1..0437052e2740d 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -103,19 +103,23 @@ class TestTableSchemaType: @pytest.mark.parametrize("int_type", [np.int, np.int16, np.int32, np.int64]) def test_as_json_table_type_int_data(self, int_type): int_data = [1, 2, 3] - assert as_json_table_type(np.array(int_data, dtype=int_type)) == "integer" + assert as_json_table_type(np.array(int_data, dtype=int_type).dtype) == "integer" @pytest.mark.parametrize( "float_type", [np.float, np.float16, np.float32, np.float64] ) def test_as_json_table_type_float_data(self, float_type): float_data = [1.0, 2.0, 3.0] - assert as_json_table_type(np.array(float_data, dtype=float_type)) == "number" + assert ( + as_json_table_type(np.array(float_data, dtype=float_type).dtype) == "number" + ) @pytest.mark.parametrize("bool_type", [bool, np.bool]) def test_as_json_table_type_bool_data(self, bool_type): bool_data = [True, False] - assert as_json_table_type(np.array(bool_data, dtype=bool_type)) == "boolean" + assert ( + as_json_table_type(np.array(bool_data, dtype=bool_type).dtype) == "boolean" + ) @pytest.mark.parametrize( "date_data", @@ -128,11 +132,11 @@ def test_as_json_table_type_bool_data(self, bool_type): ], ) def test_as_json_table_type_date_data(self, date_data): - assert as_json_table_type(date_data) == "datetime" + assert as_json_table_type(date_data.dtype) == "datetime" @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])]) def test_as_json_table_type_string_data(self, str_data): - assert as_json_table_type(str_data) == "string" + assert as_json_table_type(str_data.dtype) == "string" @pytest.mark.parametrize( "cat_data", @@ -145,7 +149,7 @@ def test_as_json_table_type_string_data(self, str_data): ], ) def test_as_json_table_type_categorical_data(self, cat_data): - assert 
as_json_table_type(cat_data) == "any" + assert as_json_table_type(cat_data.dtype) == "any" # ------ # dtypes @@ -189,7 +193,7 @@ def test_as_json_table_type_categorical_dtypes(self): # TODO: I think before is_categorical_dtype(Categorical) # returned True, but now it's False. Figure out why or # if it matters - assert as_json_table_type(pd.Categorical(["a"])) == "any" + assert as_json_table_type(pd.Categorical(["a"]).dtype) == "any" assert as_json_table_type(CategoricalDtype()) == "any" @@ -237,6 +241,24 @@ def test_build_series(self): assert result == expected + def test_read_json_from_to_json_results(self): + # GH32383 + df = pd.DataFrame( + { + "_id": {"row_0": 0}, + "category": {"row_0": "Goods"}, + "recommender_id": {"row_0": 3}, + "recommender_name_jp": {"row_0": "浦田"}, + "recommender_name_en": {"row_0": "Urata"}, + "name_jp": {"row_0": "博多人形(松尾吉将まつお よしまさ)"}, + "name_en": {"row_0": "Hakata Dolls Matsuo"}, + } + ) + result1 = pd.read_json(df.to_json()) + result2 = pd.DataFrame.from_dict(json.loads(df.to_json())) + tm.assert_frame_equal(result1, df) + tm.assert_frame_equal(result2, df) + def test_to_json(self): df = self.df.copy() df.index.name = "idx" @@ -603,8 +625,7 @@ def test_timestamp_in_columns(self): result = df.to_json(orient="table") js = json.loads(result) assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z" - # TODO - below expectation is not correct; see GH 28256 - assert js["schema"]["fields"][2]["name"] == 10000 + assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S" @pytest.mark.parametrize( "case", diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 91b204ed41ebc..b7a9918ff46da 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -486,6 +486,16 @@ def test_non_interable_record_path_errors(self): with pytest.raises(TypeError, match=msg): json_normalize([test_input], record_path=[test_path]) + def test_meta_non_iterable(self): + # GH 31507 + data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]""" + + result = json_normalize(json.loads(data), record_path=["data"], meta=["id"]) + expected = DataFrame( + {"one": [1], "two": [2], "id": np.array([99], dtype=object)} + ) + tm.assert_frame_equal(result, expected) + class TestNestedToRecord: def test_flat_stays_flat(self): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f2d35bfb3b5ae..0576d8e91d531 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -16,21 +16,15 @@ import pandas._testing as tm _seriesd = tm.getSeriesData() -_tsd = tm.getTimeSeriesData() _frame = DataFrame(_seriesd) -_frame2 = DataFrame(_seriesd, columns=["D", "C", "B", "A"]) -_intframe = DataFrame({k: v.astype(np.int64) for k, v in _seriesd.items()}) -_tsframe = DataFrame(_tsd) _cat_frame = _frame.copy() cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * (len(_cat_frame) - 15) _cat_frame.index = pd.CategoricalIndex(cat, name="E") _cat_frame["E"] = list(reversed(cat)) _cat_frame["sort"] = np.arange(len(_cat_frame), dtype="int64") -_mixed_frame = _frame.copy() - def assert_json_roundtrip_equal(result, expected, orient): if orient == "records" or orient == "values": @@ -42,48 +36,12 @@ def assert_json_roundtrip_equal(result, expected, orient): @pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: - @pytest.fixture(scope="function", autouse=True) - def setup(self, datapath): - self.dirpath = 
datapath("io", "json", "data") - - self.ts = tm.makeTimeSeries() - self.ts.name = "ts" - - self.series = tm.makeStringSeries() - self.series.name = "series" - - self.objSeries = tm.makeObjectSeries() - self.objSeries.name = "objects" - - self.empty_series = Series([], index=[], dtype=np.float64) - self.empty_frame = DataFrame() - - self.frame = _frame.copy() - self.frame2 = _frame2.copy() - self.intframe = _intframe.copy() - self.tsframe = _tsframe.copy() - self.mixed_frame = _mixed_frame.copy() + @pytest.fixture(autouse=True) + def setup(self): self.categorical = _cat_frame.copy() yield - del self.dirpath - - del self.ts - - del self.series - - del self.objSeries - - del self.empty_series - del self.empty_frame - - del self.frame - del self.frame2 - del self.intframe - del self.tsframe - del self.mixed_frame - def test_frame_double_encoded_labels(self, orient): df = DataFrame( [["a", "b"], ["c", "d"]], @@ -148,31 +106,31 @@ def test_frame_non_unique_columns_raises(self, orient): with pytest.raises(ValueError, match=msg): df.to_json(orient=orient) - def test_frame_default_orient(self): - assert self.frame.to_json() == self.frame.to_json(orient="columns") + def test_frame_default_orient(self, float_frame): + assert float_frame.to_json() == float_frame.to_json(orient="columns") @pytest.mark.parametrize("dtype", [False, float]) @pytest.mark.parametrize("convert_axes", [True, False]) @pytest.mark.parametrize("numpy", [True, False]) - def test_roundtrip_simple(self, orient, convert_axes, numpy, dtype): - data = self.frame.to_json(orient=orient) + def test_roundtrip_simple(self, orient, convert_axes, numpy, dtype, float_frame): + data = float_frame.to_json(orient=orient) result = pd.read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy, dtype=dtype ) - expected = self.frame.copy() + expected = float_frame assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) @pytest.mark.parametrize("numpy", [True, False]) - def test_roundtrip_intframe(self, orient, convert_axes, numpy, dtype): - data = self.intframe.to_json(orient=orient) + def test_roundtrip_intframe(self, orient, convert_axes, numpy, dtype, int_frame): + data = int_frame.to_json(orient=orient) result = pd.read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy, dtype=dtype ) - expected = self.intframe.copy() + expected = int_frame if ( numpy and (is_platform_32bit() or is_platform_windows()) @@ -248,12 +206,12 @@ def test_roundtrip_categorical(self, orient, convert_axes, numpy): @pytest.mark.parametrize("convert_axes", [True, False]) @pytest.mark.parametrize("numpy", [True, False]) - def test_roundtrip_empty(self, orient, convert_axes, numpy): - data = self.empty_frame.to_json(orient=orient) + def test_roundtrip_empty(self, orient, convert_axes, numpy, empty_frame): + data = empty_frame.to_json(orient=orient) result = pd.read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy ) - expected = self.empty_frame.copy() + expected = empty_frame.copy() # TODO: both conditions below are probably bugs if convert_axes: @@ -266,13 +224,13 @@ def test_roundtrip_empty(self, orient, convert_axes, numpy): @pytest.mark.parametrize("convert_axes", [True, False]) @pytest.mark.parametrize("numpy", [True, False]) - def test_roundtrip_timestamp(self, orient, convert_axes, numpy): + def test_roundtrip_timestamp(self, orient, convert_axes, numpy, datetime_frame): # TODO: improve coverage with date_format 
parameter - data = self.tsframe.to_json(orient=orient) + data = datetime_frame.to_json(orient=orient) result = pd.read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy ) - expected = self.tsframe.copy() + expected = datetime_frame.copy() if not convert_axes: # one off for ts handling # DTI gets converted to epoch values @@ -457,7 +415,7 @@ def test_frame_mixedtype_orient(self): # GH10289 left = read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) - def test_v12_compat(self): + def test_v12_compat(self, datapath): df = DataFrame( [ [1.56808523, 0.65727391, 1.81021139, -0.17251653], @@ -474,12 +432,13 @@ def test_v12_compat(self): df["modified"] = df["date"] df.iloc[1, df.columns.get_loc("modified")] = pd.NaT - v12_json = os.path.join(self.dirpath, "tsframe_v012.json") + dirpath = datapath("io", "json", "data") + v12_json = os.path.join(dirpath, "tsframe_v012.json") df_unser = pd.read_json(v12_json) tm.assert_frame_equal(df, df_unser) df_iso = df.drop(["modified"], axis=1) - v12_iso_json = os.path.join(self.dirpath, "tsframe_iso_v012.json") + v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json") df_unser_iso = pd.read_json(v12_iso_json) tm.assert_frame_equal(df_iso, df_unser_iso) @@ -572,7 +531,6 @@ def test_blocks_compat_GH9037(self): df_roundtrip, check_index_type=True, check_column_type=True, - check_frame_type=True, by_blocks=True, check_exact=True, ) @@ -634,15 +592,15 @@ def test_series_non_unique_index(self): unser = read_json(s.to_json(orient="records"), orient="records", typ="series") tm.assert_numpy_array_equal(s.values, unser.values) - def test_series_default_orient(self): - assert self.series.to_json() == self.series.to_json(orient="index") + def test_series_default_orient(self, string_series): + assert string_series.to_json() == string_series.to_json(orient="index") @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_simple(self, orient, numpy): - data = self.series.to_json(orient=orient) + def test_series_roundtrip_simple(self, orient, numpy, string_series): + data = string_series.to_json(orient=orient) result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) - expected = self.series.copy() + expected = string_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -652,13 +610,13 @@ def test_series_roundtrip_simple(self, orient, numpy): @pytest.mark.parametrize("dtype", [False, None]) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_object(self, orient, numpy, dtype): - data = self.objSeries.to_json(orient=orient) + def test_series_roundtrip_object(self, orient, numpy, dtype, object_series): + data = object_series.to_json(orient=orient) result = pd.read_json( data, typ="series", orient=orient, numpy=numpy, dtype=dtype ) - expected = self.objSeries.copy() + expected = object_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -667,12 +625,11 @@ def test_series_roundtrip_object(self, orient, numpy, dtype): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_empty(self, orient, numpy): - data = self.empty_series.to_json(orient=orient) + def test_series_roundtrip_empty(self, orient, numpy, empty_series): + data = empty_series.to_json(orient=orient) result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) - expected = self.empty_series.copy() - # TODO: see what causes 
inconsistency + expected = empty_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) else: @@ -681,11 +638,11 @@ def test_series_roundtrip_empty(self, orient, numpy): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numpy", [True, False]) - def test_series_roundtrip_timeseries(self, orient, numpy): - data = self.ts.to_json(orient=orient) + def test_series_roundtrip_timeseries(self, orient, numpy, datetime_series): + data = datetime_series.to_json(orient=orient) result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) - expected = self.ts.copy() + expected = datetime_series if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -761,35 +718,33 @@ def test_reconstruction_index(self): result = read_json(df.to_json()) tm.assert_frame_equal(result, df) - def test_path(self): + def test_path(self, float_frame, int_frame, datetime_frame): with tm.ensure_clean("test.json") as path: for df in [ - self.frame, - self.frame2, - self.intframe, - self.tsframe, - self.mixed_frame, + float_frame, + int_frame, + datetime_frame, ]: df.to_json(path) read_json(path) - def test_axis_dates(self): + def test_axis_dates(self, datetime_series, datetime_frame): # frame - json = self.tsframe.to_json() + json = datetime_frame.to_json() result = read_json(json) - tm.assert_frame_equal(result, self.tsframe) + tm.assert_frame_equal(result, datetime_frame) # series - json = self.ts.to_json() + json = datetime_series.to_json() result = read_json(json, typ="series") - tm.assert_series_equal(result, self.ts, check_names=False) + tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None - def test_convert_dates(self): + def test_convert_dates(self, datetime_series, datetime_frame): # frame - df = self.tsframe.copy() + df = datetime_frame df["date"] = Timestamp("20130101") json = df.to_json() @@ -806,7 +761,7 @@ def test_convert_dates(self): tm.assert_frame_equal(result, expected) # series - ts = Series(Timestamp("20130101"), index=self.ts.index) + ts = Series(Timestamp("20130101"), index=datetime_series.index) json = ts.to_json() result = read_json(json, typ="series") tm.assert_series_equal(result, ts) @@ -869,8 +824,8 @@ def test_convert_dates_infer(self, infer_word): ("20130101 20:43:42.123456789", "ns"), ], ) - def test_date_format_frame(self, date, date_unit): - df = self.tsframe.copy() + def test_date_format_frame(self, date, date_unit, datetime_frame): + df = datetime_frame df["date"] = Timestamp(date) df.iloc[1, df.columns.get_loc("date")] = pd.NaT @@ -885,8 +840,8 @@ def test_date_format_frame(self, date, date_unit): expected["date"] = expected["date"].dt.tz_localize("UTC") tm.assert_frame_equal(result, expected) - def test_date_format_frame_raises(self): - df = self.tsframe.copy() + def test_date_format_frame_raises(self, datetime_frame): + df = datetime_frame msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): df.to_json(date_format="iso", date_unit="foo") @@ -901,8 +856,8 @@ def test_date_format_frame_raises(self): ("20130101 20:43:42.123456789", "ns"), ], ) - def test_date_format_series(self, date, date_unit): - ts = Series(Timestamp(date), index=self.ts.index) + def test_date_format_series(self, date, date_unit, datetime_series): + ts = Series(Timestamp(date), index=datetime_series.index) ts.iloc[1] = pd.NaT ts.iloc[5] = pd.NaT if date_unit: @@ -915,15 +870,15 @@ def test_date_format_series(self, date, date_unit): expected 
= expected.dt.tz_localize("UTC") tm.assert_series_equal(result, expected) - def test_date_format_series_raises(self): - ts = Series(Timestamp("20130101 20:43:42.123"), index=self.ts.index) + def test_date_format_series_raises(self, datetime_series): + ts = Series(Timestamp("20130101 20:43:42.123"), index=datetime_series.index) msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) - def test_date_unit(self, unit): - df = self.tsframe.copy() + def test_date_unit(self, unit, datetime_frame): + df = datetime_frame df["date"] = Timestamp("20130101 20:43:42") dl = df.columns.get_loc("date") df.iloc[1, dl] = Timestamp("19710101 20:43:42") @@ -1058,6 +1013,29 @@ def test_mixed_timedelta_datetime(self): result = pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) tm.assert_frame_equal(result, expected, check_index_type=False) + @pytest.mark.parametrize("as_object", [True, False]) + @pytest.mark.parametrize("date_format", ["iso", "epoch"]) + @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta]) + def test_timedelta_to_json(self, as_object, date_format, timedelta_typ): + # GH28156: to_json not correctly formatting Timedelta + data = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT] + if as_object: + data.append("a") + + ser = pd.Series(data, index=data) + if date_format == "iso": + expected = ( + '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}' + ) + else: + expected = '{"86400000":86400000,"172800000":172800000,"null":null}' + + if as_object: + expected = expected.replace("}", ',"a":"a"}') + + result = ser.to_json(date_format=date_format) + assert result == expected + def test_default_handler(self): value = object() frame = DataFrame({"a": [7, value]}) @@ -1681,3 +1659,9 @@ def test_json_pandas_nulls(self, nulls_fixture): # GH 31615 result = pd.DataFrame([[nulls_fixture]]).to_json() assert result == '{"0":{"0":null}}' + + def test_readjson_bool_series(self): + # GH31464 + result = read_json("[true, true, false]", typ="series") + expected = pd.Series([True, True, False]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index e966db7a1cc71..34dd9ba9bc7b6 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -16,7 +16,7 @@ from pandas._libs.tslib import Timestamp import pandas.compat as compat -from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, date_range +from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range import pandas._testing as tm @@ -48,6 +48,18 @@ def numpy(request): return request.param +def get_int32_compat_dtype(numpy, orient): + # See GH#32527 + dtype = np.int64 + if not ((numpy is None or orient == "index") or (numpy is True and orient is None)): + if compat.is_platform_windows(): + dtype = np.int32 + else: + dtype = np.intp + + return dtype + + class TestUltraJSONTests: @pytest.mark.skipif( compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865" @@ -833,13 +845,20 @@ def test_dataframe(self, orient, numpy): if orient == "records" and numpy: pytest.skip("Not idiomatic pandas") + dtype = get_int32_compat_dtype(numpy, orient) + df = DataFrame( - [[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"] + [[1, 2, 3], [4, 5, 6]], + index=["a", "b"], + columns=["x", "y", "z"], + dtype=dtype, ) encode_kwargs = {} 
if orient is None else dict(orient=orient) decode_kwargs = {} if numpy is None else dict(numpy=numpy) + assert (df.dtypes == dtype).all() output = ujson.decode(ujson.encode(df, **encode_kwargs), **decode_kwargs) + assert (df.dtypes == dtype).all() # Ensure proper DataFrame initialization. if orient == "split": @@ -857,7 +876,8 @@ def test_dataframe(self, orient, numpy): elif orient == "index": df = df.transpose() - tm.assert_frame_equal(output, df, check_dtype=False) + assert (df.dtypes == dtype).all() + tm.assert_frame_equal(output, df) def test_dataframe_nested(self, orient): df = DataFrame( @@ -897,14 +917,20 @@ def test_dataframe_numpy_labelled(self, orient): tm.assert_frame_equal(output, df) def test_series(self, orient, numpy): + dtype = get_int32_compat_dtype(numpy, orient) s = Series( - [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15] + [10, 20, 30, 40, 50, 60], + name="series", + index=[6, 7, 8, 9, 10, 15], + dtype=dtype, ).sort_values() + assert s.dtype == dtype encode_kwargs = {} if orient is None else dict(orient=orient) decode_kwargs = {} if numpy is None else dict(numpy=numpy) output = ujson.decode(ujson.encode(s, **encode_kwargs), **decode_kwargs) + assert s.dtype == dtype if orient == "split": dec = _clean_dict(output) @@ -920,7 +946,8 @@ def test_series(self, orient, numpy): s.name = None s.index = [0, 1, 2, 3, 4, 5] - tm.assert_series_equal(output, s, check_dtype=False) + assert s.dtype == dtype + tm.assert_series_equal(output, s) def test_series_nested(self, orient): s = Series( @@ -1076,3 +1103,24 @@ def test_encode_set(self): for v in dec: assert v in s + + @pytest.mark.parametrize( + "td", + [ + Timedelta(days=366), + Timedelta(days=-1), + Timedelta(hours=13, minutes=5, seconds=5), + Timedelta(hours=13, minutes=20, seconds=30), + Timedelta(days=-1, nanoseconds=5), + Timedelta(nanoseconds=1), + Timedelta(microseconds=1, nanoseconds=1), + Timedelta(milliseconds=1, microseconds=1, nanoseconds=1), + Timedelta(milliseconds=999, microseconds=999, nanoseconds=999), + ], + ) + def test_encode_timedelta_iso(self, td): + # GH 28256 + result = ujson.encode(td, iso_dates=True) + expected = f'"{td.isoformat()}"' + + assert result == expected diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b3aa1aa14a509..5bf9587a6ca22 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,6 +15,7 @@ from pandas._libs.tslib import Timestamp from pandas.errors import DtypeWarning, EmptyDataError, ParserError +import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, compat, concat import pandas._testing as tm @@ -959,13 +960,23 @@ def test_nonexistent_path(all_parsers): parser = all_parsers path = f"{tm.rands(10)}.csv" - msg = f"File {path} does not exist" if parser.engine == "c" else r"\[Errno 2\]" + msg = r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) + assert path == e.value.filename - filename = e.value.filename - assert path == filename +@td.skip_if_windows # os.chmod does not work in windows +def test_no_permission(all_parsers): + # GH 23784 + parser = all_parsers + + msg = r"\[Errno 13\]" + with tm.ensure_clean() as path: + os.chmod(path, 0) # make file unreadable + with pytest.raises(PermissionError, match=msg) as e: + parser.read_csv(path) + assert path == e.value.filename def test_missing_trailing_delimiters(all_parsers): @@ -1062,14 +1073,14 @@ def test_escapechar(all_parsers): data = 
'''SEARCH_TERM,ACTUAL_URL "bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" "tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa parser = all_parsers result = parser.read_csv( StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" ) - assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie' + assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series' tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) @@ -2079,3 +2090,39 @@ def test_integer_precision(all_parsers): result = parser.read_csv(StringIO(s), header=None)[4] expected = Series([4321583677327450765, 4321113141090630389], name=4) tm.assert_series_equal(result, expected) + + +def test_file_descriptor_leak(all_parsers): + # GH 31488 + + parser = all_parsers + with tm.ensure_clean() as path: + + def test(): + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + parser.read_csv(path) + + td.check_file_leaks(test)() + + +@pytest.mark.parametrize("nrows", range(1, 6)) +def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): + # GH 28071 + ref = DataFrame( + [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]], + columns=list("ab"), + ) + csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" + parser = all_parsers + df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) + tm.assert_frame_equal(df, ref[:nrows]) + + +def test_no_header_two_extra_columns(all_parsers): + # GH 26218 + column_names = ["one", "two", "three"] + ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) + stream = StringIO("foo,bar,baz,bam,blah") + parser = all_parsers + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + tm.assert_frame_equal(df, ref) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 11dcf7f04f76b..e68dcb3aa577e 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -192,7 +192,7 @@ def test_categorical_dtype_utf16(all_parsers, csv_dir_path): pth = os.path.join(csv_dir_path, "utf16_ex.txt") parser = all_parsers encoding = "utf-16" - sep = "," + sep = "\t" expected = parser.read_csv(pth, sep=sep, encoding=encoding) expected = expected.apply(Categorical) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 3661e4e056db2..13b74cf29f857 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -5,6 +5,7 @@ from io import BytesIO import os +import tempfile import numpy as np import pytest @@ -174,3 +175,25 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding) result = parser.read_csv(f, encoding=encoding if pass_encoding else None) tm.assert_frame_equal(result, expected) + + +def test_encoding_named_temp_file(all_parsers): + # see gh-31819 + parser = all_parsers + encoding = 
"shift-jis" + + if parser.engine == "python": + pytest.skip("NamedTemporaryFile does not work with Python engine") + + title = "てすと" + data = "こむ" + + expected = DataFrame({title: [data]}) + + with tempfile.NamedTemporaryFile() as f: + f.write(f"{title}\n{data}".encode(encoding)) + + f.seek(0) + + result = parser.read_csv(f, encoding=encoding) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 31573e4e6ecce..2fcac6fa57cf8 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1516,3 +1516,35 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert except_out_dateutil == except_in_dateutil assert result == expected + + +@pytest.mark.parametrize( + "names, usecols, parse_dates, missing_cols", + [ + (None, ["val"], ["date", "time"], "date, time"), + (None, ["val"], [0, "time"], "time"), + (None, ["val"], [["date", "time"]], "date, time"), + (None, ["val"], [[0, "time"]], "time"), + (None, ["val"], {"date": [0, "time"]}, "time"), + (None, ["val"], {"date": ["date", "time"]}, "date, time"), + (None, ["val"], [["date", "time"], "date"], "date, time"), + (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"), + ( + ["date1", "time1", "temperature"], + ["date1", "temperature"], + ["date1", "time"], + "time", + ), + ], +) +def test_missing_parse_dates_column_raises( + all_parsers, names, usecols, parse_dates, missing_cols +): + # gh-31251 column names provided in parse_dates could be missing. + parser = all_parsers + content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") + msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates, + ) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 7367b19b40dc3..4d933fa02d36f 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -66,6 +66,24 @@ def test_sniff_delimiter(python_parser_only, kwargs): tm.assert_frame_equal(result, expected) +def test_sniff_delimiter_comment(python_parser_only): + data = """# comment line +index|A|B|C +# comment line +foo|1|2|3 # ignore | this +bar|4|5|6 +baz|7|8|9 +""" + parser = python_parser_only + result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#") + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("encoding", [None, "utf-8"]) def test_sniff_delimiter_encoding(python_parser_only, encoding): parser = python_parser_only diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index fd585a73f6ce6..536f4aa760b9c 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -13,8 +13,6 @@ from pandas.compat import is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td -from pandas.core.dtypes.common import is_categorical_dtype - import pandas as pd from pandas import ( Categorical, @@ -342,7 +340,7 @@ def test_repr(self, setup_path): df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = 
datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ["obj1"]] = np.nan + df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with catch_warnings(record=True): @@ -846,7 +844,7 @@ def test_put_mixed_type(self, setup_path): df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ["obj1"]] = np.nan + df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(setup_path) as store: @@ -1057,18 +1055,7 @@ def test_latin_encoding(self, setup_path, dtype, val): s_nan = ser.replace(nan_rep, np.nan) - if is_categorical_dtype(s_nan): - assert is_categorical_dtype(retr) - tm.assert_series_equal( - s_nan, retr, check_dtype=False, check_categorical=False - ) - else: - tm.assert_series_equal(s_nan, retr) - - # FIXME: don't leave commented-out - # fails: - # for x in examples: - # roundtrip(s, nan_rep=b'\xf8\xfc') + tm.assert_series_equal(s_nan, retr) def test_append_some_nans(self, setup_path): @@ -1230,14 +1217,14 @@ def test_read_missing_key_opened_store(self, setup_path): df = pd.DataFrame({"a": range(2), "b": range(2)}) df.to_hdf(path, "k1") - store = pd.HDFStore(path, "r") + with pd.HDFStore(path, "r") as store: - with pytest.raises(KeyError, match="'No object named k2 in the file'"): - pd.read_hdf(store, "k2") + with pytest.raises(KeyError, match="'No object named k2 in the file'"): + pd.read_hdf(store, "k2") - # Test that the file is still open after a KeyError and that we can - # still read from it. - pd.read_hdf(store, "k1") + # Test that the file is still open after a KeyError and that we can + # still read from it. + pd.read_hdf(store, "k1") def test_append_frame_column_oriented(self, setup_path): with ensure_clean_store(setup_path) as store: @@ -1372,11 +1359,11 @@ def check_col(key, name, size): _maybe_remove(store, "df") df = tm.makeTimeDataFrame() df["string"] = "foo" - df.loc[1:4, "string"] = np.nan + df.loc[df.index[1:4], "string"] = np.nan df["string2"] = "bar" - df.loc[4:8, "string2"] = np.nan + df.loc[df.index[4:8], "string2"] = np.nan df["string3"] = "bah" - df.loc[1:, "string3"] = np.nan + df.loc[df.index[1:], "string3"] = np.nan store.append("df", df) result = store.select("df") tm.assert_frame_equal(result, df) @@ -1492,8 +1479,8 @@ def test_append_with_data_columns(self, setup_path): # data column selection with a string data_column df_new = df.copy() df_new["string"] = "foo" - df_new.loc[1:4, "string"] = np.nan - df_new.loc[5:6, "string"] = "bar" + df_new.loc[df_new.index[1:4], "string"] = np.nan + df_new.loc[df_new.index[5:6], "string"] = "bar" _maybe_remove(store, "df") store.append("df", df_new, data_columns=["string"]) result = store.select("df", "string='foo'") @@ -1574,12 +1561,12 @@ def check_col(key, name, size): # doc example df_dc = df.copy() df_dc["string"] = "foo" - df_dc.loc[4:6, "string"] = np.nan - df_dc.loc[7:9, "string"] = "bar" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" df_dc["string2"] = "cool" df_dc["datetime"] = Timestamp("20010102") df_dc = df_dc._convert(datetime=True) - df_dc.loc[3:5, ["A", "B", "datetime"]] = np.nan + df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan _maybe_remove(store, "df_dc") store.append( @@ -1602,8 +1589,8 @@ def check_col(key, name, size): np.random.randn(8, 3), index=index, columns=["A", "B", "C"] ) df_dc["string"] = "foo" - df_dc.loc[4:6, "string"] = np.nan - df_dc.loc[7:9, 
"string"] = "bar" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() df_dc["string2"] = "cool" @@ -2024,7 +2011,7 @@ def test_table_mixed_dtypes(self, setup_path): df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ["obj1"]] = np.nan + df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(setup_path) as store: @@ -2200,7 +2187,7 @@ def test_invalid_terms(self, setup_path): df = tm.makeTimeDataFrame() df["string"] = "foo" - df.loc[0:4, "string"] = "bar" + df.loc[df.index[0:4], "string"] = "bar" store.put("df", df, format="table") @@ -2311,9 +2298,7 @@ def test_index_types(self, setup_path): with catch_warnings(record=True): values = np.random.randn(2) - func = lambda l, r: tm.assert_series_equal( - l, r, check_dtype=True, check_index_type=True, check_series_type=True - ) + func = lambda l, r: tm.assert_series_equal(l, r, check_index_type=True) with catch_warnings(record=True): ser = Series(values, [0, "y"]) @@ -2397,7 +2382,7 @@ def test_frame(self, compression, setup_path): df["foo"] = np.random.randn(len(df)) store["df"] = df recons = store["df"] - assert recons._data.is_consolidated() + assert recons._mgr.is_consolidated() # empty self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) @@ -3343,7 +3328,7 @@ def test_string_select(self, setup_path): # test string ==/!= df["x"] = "none" - df.loc[2:7, "x"] = "" + df.loc[df.index[2:7], "x"] = "" store.append("df", df, data_columns=["x"]) @@ -3365,7 +3350,7 @@ def test_string_select(self, setup_path): # int ==/!= df["int"] = 1 - df.loc[2:7, "int"] = 2 + df.loc[df.index[2:7], "int"] = 2 store.append("df3", df, data_columns=["int"]) @@ -3419,7 +3404,7 @@ def test_read_column(self, setup_path): # a data column with NaNs, result excludes the NaNs df3 = df.copy() df3["string"] = "foo" - df3.loc[4:6, "string"] = np.nan + df3.loc[df3.index[4:6], "string"] = np.nan store.append("df3", df3, data_columns=["string"]) result = store.select_column("df3", "string") tm.assert_almost_equal(result.values, df3["string"].values) diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index ee97f08ef9400..2682bafedb8f1 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -26,6 +26,7 @@ def setup_method(self, datapath): self.dirpath = datapath("io", "sas", "data") self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") + self.file02b = open(os.path.join(self.dirpath, "SSHSV1_A.xpt"), "rb") self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt") @@ -119,6 +120,16 @@ def test2(self): data = read_sas(self.file02) tm.assert_frame_equal(data, data_csv) + def test2_binary(self): + # Test with SSHSV1_A.xpt, read as a binary file + + # Compare to this + data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv")) + numeric_as_float(data_csv) + + data = read_sas(self.file02b, format="xport") + tm.assert_frame_equal(data, data_csv) + def test_multiple_types(self): # Test with DRXFCD_G.xpt (contains text and numeric variables) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3458cfb6ad254..b627e0e1cad54 100644 --- a/pandas/tests/io/test_clipboard.py +++ 
b/pandas/tests/io/test_clipboard.py @@ -75,7 +75,11 @@ def df(request): ) elif data_type == "mixed": return DataFrame( - {"a": np.arange(1.0, 6.0) + 0.01, "b": np.arange(1, 6), "c": list("abcde")} + { + "a": np.arange(1.0, 6.0) + 0.01, + "b": np.arange(1, 6).astype(np.int64), + "c": list("abcde"), + } ) elif data_type == "float": return tm.makeCustomDataframe( @@ -146,7 +150,7 @@ class TestClipboard: def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): data.to_clipboard(excel=excel, sep=sep, encoding=encoding) result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) - tm.assert_frame_equal(data, result, check_dtype=False) + tm.assert_frame_equal(data, result) # Test that default arguments copy as tab delimited def test_round_trip_frame(self, df): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 730043e6ec7d7..84bc29ebc65e0 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -137,11 +137,11 @@ def test_iterator(self): (pd.read_pickle, "os", FileNotFoundError, "pickle"), ], ) - def test_read_non_existant(self, reader, module, error_class, fn_ext): + def test_read_non_existent(self, reader, module, error_class, fn_ext): pytest.importorskip(module) path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) - msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) + msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? does not exist" msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 841241d5124e0..59c9bd0a36d3d 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -143,3 +143,44 @@ def test_with_missing_lzma_runtime(): """ ) subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) + + +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) +def test_gzip_compression_level(obj, method): + # GH33196 + with tm.ensure_clean() as path: + getattr(obj, method)(path, compression="gzip") + compressed_size_default = os.path.getsize(path) + getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1}) + compressed_size_fast = os.path.getsize(path) + assert compressed_size_default < compressed_size_fast + + +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) +def test_bzip_compression_level(obj, method): + """GH33196 bzip needs file size > 100k to show a size difference between + compression levels, so here we just check if the call works when + compression is passed as a dict. 
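For context on the compression tests added here (GH33196): a dict passed as `compression` selects the compressor via its `"method"` key and forwards the remaining keys to it. A rough sketch of the usage these tests exercise, assuming a pandas build that accepts the dict form; the file names are illustrative only.

```python
import os

import pandas as pd

df = pd.DataFrame({"x": range(10_000)})

# A plain string selects gzip at its default compression level ...
df.to_csv("default.csv.gz", compression="gzip")

# ... while the dict form forwards extra keys (here compresslevel) to gzip
df.to_csv("fast.csv.gz", compression={"method": "gzip", "compresslevel": 1})

# For data like this the fastest level yields the larger file, which is what
# test_gzip_compression_level asserts for its own frames
print(os.path.getsize("default.csv.gz"), os.path.getsize("fast.csv.gz"))
```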
+ """ + with tm.ensure_clean() as path: + getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1}) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 0038df78dd866..0755501ee6285 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -27,15 +29,15 @@ def check_error_on_write(self, df, exc): with tm.ensure_clean() as path: to_feather(df, path) - def check_round_trip(self, df, expected=None, **kwargs): + def check_round_trip(self, df, expected=None, write_kwargs={}, **read_kwargs): if expected is None: expected = df with tm.ensure_clean() as path: - to_feather(df, path) + to_feather(df, path, **write_kwargs) - result = read_feather(path, **kwargs) + result = read_feather(path, **read_kwargs) tm.assert_frame_equal(result, expected) def test_error(self): @@ -71,6 +73,10 @@ def test_basic(self): "dtns": pd.date_range("20130101", periods=3, freq="ns"), } ) + if pyarrow_version >= LooseVersion("0.16.1.dev"): + df["periods"] = pd.period_range("2013", freq="M", periods=3) + df["timedeltas"] = pd.timedelta_range("1 day", periods=3) + df["intervals"] = pd.interval_range(0, 3, 3) assert df.dttz.dtype.tz.zone == "US/Eastern" self.check_round_trip(df) @@ -102,8 +108,8 @@ def test_read_columns(self): def test_unsupported_other(self): - # period - df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # mixed python objects + df = pd.DataFrame({"a": ["a", 1, 2.0]}) # Some versions raise ValueError, others raise ArrowInvalid. self.check_error_on_write(df, Exception) @@ -148,3 +154,8 @@ def test_path_localpath(self): df = tm.makeDataFrame().reset_index() result = tm.round_trip_localpath(df.to_feather, pd.read_feather) tm.assert_frame_equal(df, result) + + @td.skip_if_no("pyarrow", min_version="0.16.1.dev") + def test_passthrough_keywords(self): + df = tm.makeDataFrame().reset_index() + self.check_round_trip(df, write_kwargs=dict(version=1)) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index cbaf16d048eda..2c93dbb5b6b83 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -72,7 +72,7 @@ def test_invalid_flavor(): msg = r"\{" + flavor + r"\} is not a valid set of flavors" with pytest.raises(ValueError, match=msg): - read_html(url, "google", flavor=flavor) + read_html(url, match="google", flavor=flavor) @td.skip_if_no("bs4") @@ -121,13 +121,26 @@ def test_to_html_compat(self): res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) + @tm.network + def test_banklist_url_positional_match(self): + url = "http://www.fdic.gov/bank/individual/failed/banklist.html" + # Passing match argument as positional should cause a FutureWarning. 
+ with tm.assert_produces_warning(FutureWarning): + df1 = self.read_html( + url, "First Federal Bank of Florida", attrs={"id": "table"} + ) + with tm.assert_produces_warning(FutureWarning): + df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"}) + + assert_framelist_equal(df1, df2) + @tm.network def test_banklist_url(self): url = "http://www.fdic.gov/bank/individual/failed/banklist.html" df1 = self.read_html( - url, "First Federal Bank of Florida", attrs={"id": "table"} + url, match="First Federal Bank of Florida", attrs={"id": "table"} ) - df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"}) + df2 = self.read_html(url, match="Metcalf Bank", attrs={"id": "table"}) assert_framelist_equal(df1, df2) @@ -137,21 +150,25 @@ def test_spam_url(self): "https://raw.githubusercontent.com/pandas-dev/pandas/master/" "pandas/tests/io/data/html/spam.html" ) - df1 = self.read_html(url, ".*Water.*") - df2 = self.read_html(url, "Unit") + df1 = self.read_html(url, match=".*Water.*") + df2 = self.read_html(url, match="Unit") assert_framelist_equal(df1, df2) @pytest.mark.slow def test_banklist(self): - df1 = self.read_html(self.banklist_data, ".*Florida.*", attrs={"id": "table"}) - df2 = self.read_html(self.banklist_data, "Metcalf Bank", attrs={"id": "table"}) + df1 = self.read_html( + self.banklist_data, match=".*Florida.*", attrs={"id": "table"} + ) + df2 = self.read_html( + self.banklist_data, match="Metcalf Bank", attrs={"id": "table"} + ) assert_framelist_equal(df1, df2) def test_spam(self): - df1 = self.read_html(self.spam_data, ".*Water.*") - df2 = self.read_html(self.spam_data, "Unit") + df1 = self.read_html(self.spam_data, match=".*Water.*") + df2 = self.read_html(self.spam_data, match="Unit") assert_framelist_equal(df1, df2) assert df1[0].iloc[0, 0] == "Proximates" @@ -168,81 +185,82 @@ def test_banklist_no_match(self): assert isinstance(df, DataFrame) def test_spam_header(self): - df = self.read_html(self.spam_data, ".*Water.*", header=2)[0] + df = self.read_html(self.spam_data, match=".*Water.*", header=2)[0] assert df.columns[0] == "Proximates" assert not df.empty def test_skiprows_int(self): - df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1) - df2 = self.read_html(self.spam_data, "Unit", skiprows=1) + df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1) + df2 = self.read_html(self.spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_range(self): - df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=range(2))[0] - df2 = self.read_html(self.spam_data, "Unit", skiprows=range(2))[0] - tm.assert_frame_equal(df1, df2) + df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=range(2)) + df2 = self.read_html(self.spam_data, match="Unit", skiprows=range(2)) + + assert_framelist_equal(df1, df2) def test_skiprows_list(self): - df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=[1, 2]) - df2 = self.read_html(self.spam_data, "Unit", skiprows=[2, 1]) + df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=[1, 2]) + df2 = self.read_html(self.spam_data, match="Unit", skiprows=[2, 1]) assert_framelist_equal(df1, df2) def test_skiprows_set(self): - df1 = self.read_html(self.spam_data, ".*Water.*", skiprows={1, 2}) - df2 = self.read_html(self.spam_data, "Unit", skiprows={2, 1}) + df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows={1, 2}) + df2 = self.read_html(self.spam_data, match="Unit", skiprows={2, 1}) assert_framelist_equal(df1, df2) def test_skiprows_slice(self): - df1 = 
self.read_html(self.spam_data, ".*Water.*", skiprows=1) - df2 = self.read_html(self.spam_data, "Unit", skiprows=1) + df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1) + df2 = self.read_html(self.spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_slice_short(self): - df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2)) - df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(2)) + df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2)) + df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(2)) assert_framelist_equal(df1, df2) def test_skiprows_slice_long(self): - df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2, 5)) - df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(4, 1, -1)) + df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2, 5)) + df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) def test_skiprows_ndarray(self): - df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=np.arange(2)) - df2 = self.read_html(self.spam_data, "Unit", skiprows=np.arange(2)) + df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=np.arange(2)) + df2 = self.read_html(self.spam_data, match="Unit", skiprows=np.arange(2)) assert_framelist_equal(df1, df2) def test_skiprows_invalid(self): with pytest.raises(TypeError, match=("is not a valid type for skipping rows")): - self.read_html(self.spam_data, ".*Water.*", skiprows="asdf") + self.read_html(self.spam_data, match=".*Water.*", skiprows="asdf") def test_index(self): - df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0) - df2 = self.read_html(self.spam_data, "Unit", index_col=0) + df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0) + df2 = self.read_html(self.spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_no_types(self): - df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0) - df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0) + df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0) + df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_with_types(self): - df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0) - df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0) + df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0) + df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) def test_infer_types(self): # 10892 infer_types removed - df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0) - df2 = self.read_html(self.spam_data, "Unit", index_col=0) + df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0) + df2 = self.read_html(self.spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) def test_string_io(self): @@ -252,25 +270,25 @@ def test_string_io(self): with open(self.spam_data, **self.spam_data_kwargs) as f: data2 = StringIO(f.read()) - df1 = self.read_html(data1, ".*Water.*") - df2 = self.read_html(data2, "Unit") + df1 = self.read_html(data1, match=".*Water.*") + df2 = self.read_html(data2, match="Unit") assert_framelist_equal(df1, df2) def test_string(self): with open(self.spam_data, **self.spam_data_kwargs) as f: data = f.read() - df1 = self.read_html(data, ".*Water.*") 
- df2 = self.read_html(data, "Unit") + df1 = self.read_html(data, match=".*Water.*") + df2 = self.read_html(data, match="Unit") assert_framelist_equal(df1, df2) def test_file_like(self): with open(self.spam_data, **self.spam_data_kwargs) as f: - df1 = self.read_html(f, ".*Water.*") + df1 = self.read_html(f, match=".*Water.*") with open(self.spam_data, **self.spam_data_kwargs) as f: - df2 = self.read_html(f, "Unit") + df2 = self.read_html(f, match="Unit") assert_framelist_equal(df1, df2) @@ -292,7 +310,7 @@ def test_invalid_url(self): def test_file_url(self): url = self.banklist_data dfs = self.read_html( - file_path_to_url(os.path.abspath(url)), "First", attrs={"id": "table"} + file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"} ) assert isinstance(dfs, list) for df in dfs: @@ -303,12 +321,12 @@ def test_invalid_table_attrs(self): url = self.banklist_data with pytest.raises(ValueError, match="No tables found"): self.read_html( - url, "First Federal Bank of Florida", attrs={"id": "tasdfable"} + url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"} ) def _bank_data(self, *args, **kwargs): return self.read_html( - self.banklist_data, "Metcalf", attrs={"id": "table"}, *args, **kwargs + self.banklist_data, match="Metcalf", attrs={"id": "table"}, *args, **kwargs ) @pytest.mark.slow @@ -358,7 +376,7 @@ def test_regex_idempotency(self): def test_negative_skiprows(self): msg = r"\(you passed a negative value\)" with pytest.raises(ValueError, match=msg): - self.read_html(self.spam_data, "Water", skiprows=-1) + self.read_html(self.spam_data, match="Water", skiprows=-1) @tm.network def test_multiple_matches(self): @@ -555,7 +573,9 @@ def try_remove_ws(x): except AttributeError: return x - df = self.read_html(self.banklist_data, "Metcalf", attrs={"id": "table"})[0] + df = self.read_html(self.banklist_data, match="Metcalf", attrs={"id": "table"})[ + 0 + ] ground_truth = read_csv( datapath("io", "data", "csv", "banklist.csv"), converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, @@ -600,7 +620,9 @@ def test_gold_canyon(self): raw_text = f.read() assert gc in raw_text - df = self.read_html(self.banklist_data, "Gold Canyon", attrs={"id": "table"})[0] + df = self.read_html( + self.banklist_data, match="Gold Canyon", attrs={"id": "table"} + )[0] assert gc in df.to_string() def test_different_number_of_cols(self): @@ -855,7 +877,7 @@ def test_wikipedia_states_table(self, datapath): data = datapath("io", "data", "html", "wikipedia_states.html") assert os.path.isfile(data), f"{repr(data)} is not a file" assert os.path.getsize(data), f"{repr(data)} is an empty file" - result = self.read_html(data, "Arizona", header=1)[0] + result = self.read_html(data, match="Arizona", header=1)[0] assert result.shape == (60, 12) assert "Unnamed" in result.columns[-1] assert result["sq mi"].dtype == np.dtype("float64") @@ -863,7 +885,7 @@ def test_wikipedia_states_table(self, datapath): def test_wikipedia_states_multiindex(self, datapath): data = datapath("io", "data", "html", "wikipedia_states.html") - result = self.read_html(data, "Arizona", index_col=0)[0] + result = self.read_html(data, match="Arizona", index_col=0)[0] assert result.shape == (60, 11) assert "Unnamed" in result.columns[-1][1] assert result.columns.nlevels == 2 @@ -1065,7 +1087,7 @@ def test_works_on_valid_markup(self, datapath): @pytest.mark.slow def test_fallback_success(self, datapath): banklist_data = datapath("io", "data", "html", "banklist.html") - self.read_html(banklist_data, ".*Water.*", 
flavor=["lxml", "html5lib"]) + self.read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"]) def test_to_html_timestamp(self): rng = date_range("2000-01-01", periods=10) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index cfcf617cedf9c..94cf16c20e6c4 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1,6 +1,7 @@ """ test parquet compat """ import datetime from distutils.version import LooseVersion +import locale import os from warnings import catch_warnings @@ -34,6 +35,7 @@ except ImportError: _HAVE_FASTPARQUET = False + pytestmark = pytest.mark.filterwarnings( "ignore:RangeIndex.* is deprecated:DeprecationWarning" ) @@ -222,6 +224,49 @@ def test_options_get_engine(fp, pa): assert isinstance(get_engine("fastparquet"), FastParquetImpl) +def test_get_engine_auto_error_message(): + # Expect different error messages from get_engine(engine="auto") + # if engines aren't installed vs. are installed but bad version + from pandas.compat._optional import VERSIONS + + # Do we have engines installed, but a bad version of them? + pa_min_ver = VERSIONS.get("pyarrow") + fp_min_ver = VERSIONS.get("fastparquet") + have_pa_bad_version = ( + False + if not _HAVE_PYARROW + else LooseVersion(pyarrow.__version__) < LooseVersion(pa_min_ver) + ) + have_fp_bad_version = ( + False + if not _HAVE_FASTPARQUET + else LooseVersion(fastparquet.__version__) < LooseVersion(fp_min_ver) + ) + # Do we have usable engines installed? + have_usable_pa = _HAVE_PYARROW and not have_pa_bad_version + have_usable_fp = _HAVE_FASTPARQUET and not have_fp_bad_version + + if not have_usable_pa and not have_usable_fp: + # No usable engines found. + if have_pa_bad_version: + match = f"Pandas requires version .{pa_min_ver}. or newer of .pyarrow." + with pytest.raises(ImportError, match=match): + get_engine("auto") + else: + match = "Missing optional dependency .pyarrow." + with pytest.raises(ImportError, match=match): + get_engine("auto") + + if have_fp_bad_version: + match = f"Pandas requires version .{fp_min_ver}. or newer of .fastparquet." + with pytest.raises(ImportError, match=match): + get_engine("auto") + else: + match = "Missing optional dependency .fastparquet." + with pytest.raises(ImportError, match=match): + get_engine("auto") + + def test_cross_engine_pa_fp(df_cross_compat, pa, fp): # cross-compat with differing reading/writing engines @@ -483,6 +528,11 @@ def test_categorical(self, pa): expected = df.astype(object) check_round_trip(df, pa, expected=expected) + # GH#33077 2020-03-27 + @pytest.mark.xfail( + locale.getlocale()[0] == "zh_CN", + reason="dateutil cannot parse e.g. 
'五, 27 3月 2020 21:45:38 GMT'", + ) def test_s3_roundtrip(self, df_compat, s3_resource, pa): # GH #19134 check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index fc3876eee9d66..2f2ae8cd9d32b 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1480,6 +1480,14 @@ def test_datetime_with_timezone_roundtrip(self): result["A"] = to_datetime(result["A"]) tm.assert_frame_equal(result, expected) + def test_out_of_bounds_datetime(self): + # GH 26761 + data = pd.DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) + data.to_sql("test_datetime_obb", self.conn, index=False) + result = sql.read_sql_table("test_datetime_obb", self.conn) + expected = pd.DataFrame([pd.NaT], columns=["date"]) + tm.assert_frame_equal(result, expected) + def test_naive_datetimeindex_roundtrip(self): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC @@ -2633,6 +2641,8 @@ def test_write_row_by_row(self): result = sql.read_sql("select * from test", con=self.conn) result.index = frame.index tm.assert_frame_equal(result, frame, check_less_precise=True) + # GH#32571 result comes back rounded to 6 digits in some builds; + # no obvious pattern def test_chunksize_read_type(self): frame = tm.makeTimeDataFrame() diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index b65efac2bd527..eaa92fa53d799 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -254,12 +254,21 @@ def test_read_dta4(self, file): ) # these are all categoricals - expected = pd.concat( - [expected[col].astype("category") for col in expected], axis=1 - ) + for col in expected: + orig = expected[col].copy() + + categories = np.asarray(expected["fully_labeled"][orig.notna()]) + if col == "incompletely_labeled": + categories = orig + + cat = orig.astype("category")._values + cat = cat.set_categories(categories, ordered=True) + cat.categories.rename(None, inplace=True) + + expected[col] = cat # stata doesn't save .category metadata - tm.assert_frame_equal(parsed, expected, check_categorical=False) + tm.assert_frame_equal(parsed, expected) # File containing strls def test_read_dta12(self): @@ -952,19 +961,27 @@ def test_categorical_writing(self, version): original = pd.concat( [original[col].astype("category") for col in original], axis=1 ) + expected.index.name = "index" expected["incompletely_labeled"] = expected["incompletely_labeled"].apply(str) expected["unlabeled"] = expected["unlabeled"].apply(str) - expected = pd.concat( - [expected[col].astype("category") for col in expected], axis=1 - ) - expected.index.name = "index" + for col in expected: + orig = expected[col].copy() + + cat = orig.astype("category")._values + cat = cat.as_ordered() + if col == "unlabeled": + cat = cat.set_categories(orig, ordered=True) + + cat.categories.rename(None, inplace=True) + + expected[col] = cat with tm.ensure_clean() as path: original.to_stata(path, version=version) written_and_read_again = self.read_dta(path) res = written_and_read_again.set_index("index") - tm.assert_frame_equal(res, expected, check_categorical=False) + tm.assert_frame_equal(res, expected) def test_categorical_warnings_and_errors(self): # Warning for non-string labels @@ -1009,7 +1026,14 @@ def test_categorical_with_stata_missing_values(self, version): original.to_stata(path, version=version) written_and_read_again = self.read_dta(path) res = written_and_read_again.set_index("index") - tm.assert_frame_equal(res, original, 
check_categorical=False) + + expected = original.copy() + for col in expected: + cat = expected[col]._values + new_cats = cat.remove_unused_categories().categories + cat = cat.set_categories(new_cats, ordered=True) + expected[col] = cat + tm.assert_frame_equal(res, expected) @pytest.mark.parametrize("file", ["dta19_115", "dta19_117"]) def test_categorical_order(self, file): @@ -1027,7 +1051,9 @@ def test_categorical_order(self, file): cols = [] for is_cat, col, labels, codes in expected: if is_cat: - cols.append((col, pd.Categorical.from_codes(codes, labels))) + cols.append( + (col, pd.Categorical.from_codes(codes, labels, ordered=True)) + ) else: cols.append((col, pd.Series(labels, dtype=np.float32))) expected = DataFrame.from_dict(dict(cols)) @@ -1035,7 +1061,7 @@ def test_categorical_order(self, file): # Read with and with out categoricals, ensure order is identical file = getattr(self, file) parsed = read_stata(file) - tm.assert_frame_equal(expected, parsed, check_categorical=False) + tm.assert_frame_equal(expected, parsed) # Check identity of codes for col in expected: @@ -1056,9 +1082,11 @@ def test_categorical_sorting(self, file): parsed.index = np.arange(parsed.shape[0]) codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4] categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] - cat = pd.Categorical.from_codes(codes=codes, categories=categories) + cat = pd.Categorical.from_codes( + codes=codes, categories=categories, ordered=True + ) expected = pd.Series(cat, name="srh") - tm.assert_series_equal(expected, parsed["srh"], check_categorical=False) + tm.assert_series_equal(expected, parsed["srh"]) @pytest.mark.parametrize("file", ["dta19_115", "dta19_117"]) def test_categorical_ordering(self, file): @@ -1118,18 +1146,30 @@ def test_read_chunks_117( chunk = itr.read(chunksize) except StopIteration: break - from_frame = parsed.iloc[pos : pos + chunksize, :] + from_frame = parsed.iloc[pos : pos + chunksize, :].copy() + from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, - chunk, - check_dtype=False, - check_datetimelike_compat=True, - check_categorical=False, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, ) pos += chunksize itr.close() + @staticmethod + def _convert_categorical(from_frame: DataFrame) -> DataFrame: + """ + Emulate the categorical casting behavior we expect from roundtripping. 
+ """ + for col in from_frame: + ser = from_frame[col] + if is_categorical_dtype(ser.dtype): + cat = ser._values.remove_unused_categories() + if cat.categories.dtype == object: + categories = pd.Index(cat.categories._values) + cat = cat.set_categories(categories) + from_frame[col] = cat + return from_frame + def test_iterator(self): fname = self.dta3_117 @@ -1204,13 +1244,10 @@ def test_read_chunks_115( chunk = itr.read(chunksize) except StopIteration: break - from_frame = parsed.iloc[pos : pos + chunksize, :] + from_frame = parsed.iloc[pos : pos + chunksize, :].copy() + from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, - chunk, - check_dtype=False, - check_datetimelike_compat=True, - check_categorical=False, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, ) pos += chunksize diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index ea0ec8ad98ffe..f2f7b37170ec9 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -1,6 +1,3 @@ -#!/usr/bin/env python3 -# coding: utf-8 - import os import warnings @@ -34,6 +31,7 @@ def setup_method(self, method): self.mpl_ge_2_2_3 = compat._mpl_ge_2_2_3() self.mpl_ge_3_0_0 = compat._mpl_ge_3_0_0() self.mpl_ge_3_1_0 = compat._mpl_ge_3_1_0() + self.mpl_ge_3_2_0 = compat._mpl_ge_3_2_0() self.bp_n_objects = 7 self.polycollection_factor = 2 diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index b84fcffe26991..0a096acc9fa6d 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,5 +1,3 @@ -# coding: utf-8 - import itertools import string diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 979b89a87d843..b85a2affc4e4b 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1468,7 +1468,9 @@ def test_matplotlib_scatter_datetime64(self): ax.scatter(x="time", y="y", data=df) self.plt.draw() label = ax.get_xticklabels()[0] - if self.mpl_ge_3_0_0: + if self.mpl_ge_3_2_0: + expected = "2018-01-01" + elif self.mpl_ge_3_0_0: expected = "2017-12-08" else: expected = "2017-12-12" diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index ffbd135466709..c84a09f21f46b 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1,5 +1,3 @@ -# coding: utf-8 - """ Test cases for DataFrame.plot """ from datetime import date, datetime @@ -1308,6 +1306,13 @@ def test_plot_scatter_with_c(self): float_array = np.array([0.0, 1.0]) df.plot.scatter(x="A", y="B", c=float_array, cmap="spring") + def test_plot_scatter_with_s(self): + # this refers to GH 32904 + df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"],) + + ax = df.plot.scatter(x="a", y="b", s="c") + tm.assert_numpy_array_equal(df["c"].values, right=ax.collections[0].get_sizes()) + def test_scatter_colors(self): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) with pytest.raises(TypeError): @@ -1327,6 +1332,20 @@ def test_scatter_colors(self): np.array([1, 1, 1, 1], dtype=np.float64), ) + def test_scatter_colorbar_different_cmap(self): + # GH 33389 + import matplotlib.pyplot as plt + + df = pd.DataFrame({"x": [1, 2, 3], "y": [1, 3, 2], "c": [1, 2, 3]}) + df["x2"] = df["x"] + 1 + + fig, ax = plt.subplots() + df.plot("x", "y", c="c", kind="scatter", cmap="cividis", ax=ax) + df.plot("x2", "y", 
c="c", kind="scatter", cmap="magma", ax=ax) + + assert ax.collections[0].cmap.name == "cividis" + assert ax.collections[1].cmap.name == "magma" + @pytest.mark.slow def test_plot_bar(self): df = DataFrame( @@ -1677,6 +1696,25 @@ def test_hist_df(self): axes = df.plot.hist(rot=50, fontsize=8, orientation="horizontal") self._check_ticks_props(axes, xrot=0, yrot=50, ylabelsize=8) + @pytest.mark.parametrize( + "weights", [0.1 * np.ones(shape=(100,)), 0.1 * np.ones(shape=(100, 2))] + ) + def test_hist_weights(self, weights): + # GH 33173 + np.random.seed(0) + df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100,)))) + + ax1 = _check_plot_works(df.plot, kind="hist", weights=weights) + ax2 = _check_plot_works(df.plot, kind="hist") + + patch_height_with_weights = [patch.get_height() for patch in ax1.patches] + + # original heights with no weights, and we manually multiply with example + # weights, so after multiplication, they should be almost same + expected_patch_height = [0.1 * patch.get_height() for patch in ax2.patches] + + tm.assert_almost_equal(patch_height_with_weights, expected_patch_height) + def _check_box_coord( self, patches, @@ -2036,12 +2074,6 @@ def test_line_colors(self): self._check_colors(ax.get_lines(), linecolors=custom_colors) tm.close() - with pytest.raises(ValueError): - # Color contains shorthand hex value results in ValueError - custom_colors = ["#F00", "#00F", "#FF0", "#000", "#FFF"] - # Forced show plot - _check_plot_works(df.plot, color=custom_colors) - @pytest.mark.slow def test_dont_modify_colors(self): colors = ["r", "g", "b"] @@ -2093,14 +2125,6 @@ def test_line_colors_and_styles_subplots(self): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() - with pytest.raises(ValueError): - # Color contains shorthand hex value results in ValueError - custom_colors = ["#F00", "#00F", "#FF0", "#000", "#FFF"] - # Forced show plot - # _check_plot_works adds an ax so catch warning. see GH #13188 - with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.plot, color=custom_colors, subplots=True) - rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] for cmap in ["jet", cm.jet]: axes = df.plot(colormap=cmap, subplots=True) @@ -3316,6 +3340,16 @@ def test_missing_markers_legend_using_style(self): self._check_legend_labels(ax, labels=["A", "B", "C"]) self._check_legend_marker(ax, expected_markers=[".", ".", "."]) + def test_colors_of_columns_with_same_name(self): + # ISSUE 11136 -> https://github.com/pandas-dev/pandas/issues/11136 + # Creating a DataFrame with duplicate column labels and testing colors of them. 
+ df = pd.DataFrame({"b": [0, 1, 0], "a": [1, 2, 3]}) + df1 = pd.DataFrame({"a": [2, 4, 6]}) + df_concat = pd.concat([df, df1], axis=1) + result = df_concat.plot() + for legend, line in zip(result.get_legend().legendHandles, result.lines): + assert legend.get_color() == line.get_color() + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index 8fec4bb134cb4..238639bd3732d 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -1,5 +1,3 @@ -# coding: utf-8 - """ Test cases for GroupBy.plot """ diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 50ebbc22f2739..fba4f07f6cc0f 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -1,5 +1,3 @@ -# coding: utf-8 - """ Test cases for .hist method """ import numpy as np diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 168e8c7de0b83..27039948dfc16 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -1,5 +1,3 @@ -# coding: utf-8 - """ Test cases for misc plot functions """ import numpy as np diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 8463f30bee8f0..5341878d4986e 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -1,5 +1,3 @@ -# coding: utf-8 - """ Test cases for Series.plot """ diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 211d0d52d8357..fa62d5d8c4983 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -55,9 +55,7 @@ def test_ops(self, opname, obj): if not isinstance(obj, PeriodIndex): expected = getattr(obj.values, opname)() else: - expected = pd.Period( - ordinal=getattr(obj._ndarray_values, opname)(), freq=obj.freq - ) + expected = pd.Period(ordinal=getattr(obj.asi8, opname)(), freq=obj.freq) try: assert result == expected except TypeError: @@ -67,27 +65,58 @@ def test_ops(self, opname, obj): assert result.value == expected @pytest.mark.parametrize("opname", ["max", "min"]) - def test_nanops(self, opname, index_or_series): + @pytest.mark.parametrize( + "dtype, val", + [ + ("object", 2.0), + ("float64", 2.0), + ("datetime64[ns]", datetime(2011, 11, 1)), + ("Int64", 2), + ("boolean", True), + ], + ) + def test_nanminmax(self, opname, dtype, val, index_or_series): # GH#7261 klass = index_or_series - arg_op = "arg" + opname if klass is Index else "idx" + opname - obj = klass([np.nan, 2.0]) - assert getattr(obj, opname)() == 2.0 + if dtype in ["Int64", "boolean"] and klass == pd.Index: + pytest.skip("EAs can't yet be stored in an index") - obj = klass([np.nan]) - assert pd.isna(getattr(obj, opname)()) - assert pd.isna(getattr(obj, opname)(skipna=False)) + def check_missing(res): + if dtype == "datetime64[ns]": + return res is pd.NaT + elif dtype == "Int64": + return res is pd.NA + else: + return pd.isna(res) - obj = klass([], dtype=object) - assert pd.isna(getattr(obj, opname)()) - assert pd.isna(getattr(obj, opname)(skipna=False)) + obj = klass([None], dtype=dtype) + assert check_missing(getattr(obj, opname)()) + assert check_missing(getattr(obj, opname)(skipna=False)) - obj = klass([pd.NaT, datetime(2011, 11, 1)]) - # check DatetimeIndex monotonic path - assert getattr(obj, opname)() == datetime(2011, 11, 1) 
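For reference, the reworked `test_nanminmax` above ties the missing-value sentinel returned by `min`/`max` to the dtype. A small sketch of those expectations, assuming the nullable `Int64` dtype is available:

```python
import pandas as pd

# datetime64 reductions signal "missing" with pd.NaT ...
dt = pd.Series([pd.NaT, pd.Timestamp("2011-11-01")])
assert dt.max() == pd.Timestamp("2011-11-01")
assert dt.max(skipna=False) is pd.NaT

# ... while nullable-integer reductions use pd.NA rather than np.nan.
ints = pd.Series([None, 2], dtype="Int64")
assert ints.min() == 2
assert ints.min(skipna=False) is pd.NA
```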
- assert getattr(obj, opname)(skipna=False) is pd.NaT + obj = klass([], dtype=dtype) + assert check_missing(getattr(obj, opname)()) + assert check_missing(getattr(obj, opname)(skipna=False)) + + if dtype == "object": + # generic test with object only works for empty / all NaN + return + obj = klass([None, val], dtype=dtype) + assert getattr(obj, opname)() == val + assert check_missing(getattr(obj, opname)(skipna=False)) + + obj = klass([None, val, None], dtype=dtype) + assert getattr(obj, opname)() == val + assert check_missing(getattr(obj, opname)(skipna=False)) + + @pytest.mark.parametrize("opname", ["max", "min"]) + def test_nanargminmax(self, opname, index_or_series): + # GH#7261 + klass = index_or_series + arg_op = "arg" + opname if klass is Index else "idx" + opname + + obj = klass([pd.NaT, datetime(2011, 11, 1)]) assert getattr(obj, arg_op)() == 1 result = getattr(obj, arg_op)(skipna=False) if klass is Series: @@ -97,9 +126,6 @@ def test_nanops(self, opname, index_or_series): obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT]) # check DatetimeIndex non-monotonic path - assert getattr(obj, opname)(), datetime(2011, 11, 1) - assert getattr(obj, opname)(skipna=False) is pd.NaT - assert getattr(obj, arg_op)() == 1 result = getattr(obj, arg_op)(skipna=False) if klass is Series: @@ -533,13 +559,14 @@ def test_sum_inf(self): res = nanops.nansum(arr, axis=1) assert np.isinf(res).all() + @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"]) @pytest.mark.parametrize("use_bottleneck", [True, False]) @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) - def test_empty(self, method, unit, use_bottleneck): + def test_empty(self, method, unit, use_bottleneck, dtype): with pd.option_context("use_bottleneck", use_bottleneck): # GH#9422 / GH#18921 # Entirely empty - s = Series([], dtype=object) + s = Series([], dtype=dtype) # NA by default result = getattr(s, method)() assert result == unit @@ -562,8 +589,14 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)(skipna=True, min_count=1) assert pd.isna(result) + result = getattr(s, method)(skipna=False, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=False, min_count=1) + assert pd.isna(result) + # All-NA - s = Series([np.nan]) + s = Series([np.nan], dtype=dtype) # NA by default result = getattr(s, method)() assert result == unit @@ -587,7 +620,7 @@ def test_empty(self, method, unit, use_bottleneck): assert pd.isna(result) # Mix of valid, empty - s = Series([np.nan, 1]) + s = Series([np.nan, 1], dtype=dtype) # Default result = getattr(s, method)() assert result == 1.0 @@ -606,22 +639,22 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)(skipna=True, min_count=0) assert result == 1.0 - result = getattr(s, method)(skipna=True, min_count=1) - assert result == 1.0 - # GH#844 (changed in GH#9422) - df = DataFrame(np.empty((10, 0))) + df = DataFrame(np.empty((10, 0)), dtype=dtype) assert (getattr(df, method)(1) == unit).all() - s = pd.Series([1]) + s = pd.Series([1], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) - s = pd.Series([np.nan]) + result = getattr(s, method)(skipna=False, min_count=2) + assert pd.isna(result) + + s = pd.Series([np.nan], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) - s = pd.Series([np.nan, 1]) + s = pd.Series([np.nan, 1], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) @@ -858,6 +891,30 @@ def 
test_all_any_params(self): with pytest.raises(NotImplementedError): s.all(bool_only=True) + def test_all_any_boolean(self): + # Check skipna, with boolean type + s1 = Series([pd.NA, True], dtype="boolean") + s2 = Series([pd.NA, False], dtype="boolean") + assert s1.all(skipna=False) is pd.NA # NA && True => NA + assert s1.all(skipna=True) + assert s2.any(skipna=False) is pd.NA # NA || False => NA + assert not s2.any(skipna=True) + + # GH-33253: all True / all False values buggy with skipna=False + s3 = Series([True, True], dtype="boolean") + s4 = Series([False, False], dtype="boolean") + assert s3.all(skipna=False) + assert not s4.any(skipna=False) + + # Check level TODO(GH-33449) result should also be boolean + s = pd.Series( + [False, False, True, True, False, True], + index=[0, 0, 1, 1, 2, 2], + dtype="boolean", + ) + tm.assert_series_equal(s.all(level=0), Series([False, True, False])) + tm.assert_series_equal(s.any(level=0), Series([False, True, True])) + def test_timedelta64_analytics(self): # index min/max diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index d5b71a6e4cee1..fa53e49269f8b 100644 --- a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -134,7 +134,7 @@ def series(index, _series_name, _static_values): @pytest.fixture -def empty_series(series): +def empty_series_dti(series): """ Fixture for parametrization of empty Series with date_range, period_range and timedelta_range indexes @@ -153,7 +153,7 @@ def frame(index, _series_name, _static_values): @pytest.fixture -def empty_frame(series): +def empty_frame_dti(series): """ Fixture for parametrization of empty DataFrame with date_range, period_range and timedelta_range indexes diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index c84a5bf653b0a..6384c5f19c898 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -94,13 +94,13 @@ def test_raises_on_non_datetimelike_index(): @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) -def test_resample_empty_series(freq, empty_series, resample_method): +def test_resample_empty_series(freq, empty_series_dti, resample_method): # GH12771 & GH12868 if resample_method == "ohlc": pytest.skip("need to test for ohlc from GH13083") - s = empty_series + s = empty_series_dti result = getattr(s.resample(freq), resample_method)() expected = s.copy() @@ -114,22 +114,22 @@ def test_resample_empty_series(freq, empty_series, resample_method): @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) -def test_resample_count_empty_series(freq, empty_series, resample_method): +def test_resample_count_empty_series(freq, empty_series_dti, resample_method): # GH28427 - result = getattr(empty_series.resample(freq), resample_method)() + result = getattr(empty_series_dti.resample(freq), resample_method)() - index = _asfreq_compat(empty_series.index, freq) + index = _asfreq_compat(empty_series_dti.index, freq) - expected = pd.Series([], dtype="int64", index=index, name=empty_series.name) + expected = pd.Series([], dtype="int64", index=index, name=empty_series_dti.name) tm.assert_series_equal(result, expected) @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) -def test_resample_empty_dataframe(empty_frame, freq, resample_method): +def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # GH13212 - df = empty_frame + df = empty_frame_dti # count retains dimensions too result = 
getattr(df.resample(freq), resample_method)() if resample_method != "size": @@ -149,15 +149,14 @@ def test_resample_empty_dataframe(empty_frame, freq, resample_method): @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) -def test_resample_count_empty_dataframe(freq, empty_frame): +def test_resample_count_empty_dataframe(freq, empty_frame_dti): # GH28427 - empty_frame = empty_frame.copy() - empty_frame["a"] = [] + empty_frame_dti["a"] = [] - result = empty_frame.resample(freq).count() + result = empty_frame_dti.resample(freq).count() - index = _asfreq_compat(empty_frame.index, freq) + index = _asfreq_compat(empty_frame_dti.index, freq) expected = pd.DataFrame({"a": []}, dtype="int64", index=index) @@ -166,15 +165,14 @@ def test_resample_count_empty_dataframe(freq, empty_frame): @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) -def test_resample_size_empty_dataframe(freq, empty_frame): +def test_resample_size_empty_dataframe(freq, empty_frame_dti): # GH28427 - empty_frame = empty_frame.copy() - empty_frame["a"] = [] + empty_frame_dti["a"] = [] - result = empty_frame.resample(freq).size() + result = empty_frame_dti.resample(freq).size() - index = _asfreq_compat(empty_frame.index, freq) + index = _asfreq_compat(empty_frame_dti.index, freq) expected = pd.Series([], dtype="int64", index=index) @@ -188,9 +186,9 @@ def test_resample_empty_dtypes(index, dtype, resample_method): # Empty series were sometimes causing a segfault (for the functions # with Cython bounds-checking disabled) or an IndexError. We just run # them to ensure they no longer do. (GH #10228) - empty_series = Series([], index, dtype) + empty_series_dti = Series([], index, dtype) try: - getattr(empty_series.resample("d"), resample_method)() + getattr(empty_series_dti.resample("d"), resample_method)() except DataError: # Ignore these since some combinations are invalid # (ex: doing mean with dtype of np.object) @@ -227,9 +225,9 @@ def test_resample_loffset_arg_type(frame, create_index, arg): @all_ts -def test_apply_to_empty_series(empty_series): +def test_apply_to_empty_series(empty_series_dti): # GH 14313 - s = empty_series + s = empty_series_dti for freq in ["M", "D", "H"]: result = s.resample(freq).apply(lambda x: 1) expected = s.resample(freq).apply(np.sum) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 3ad82b9e075a8..f15d39e9e6456 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -122,9 +122,7 @@ def test_resample_integerarray(): result = ts.resample("3T").mean() expected = Series( - [1, 4, 7], - index=pd.date_range("1/1/2000", periods=3, freq="3T"), - dtype="float64", + [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64", ) tm.assert_series_equal(result, expected) @@ -1440,6 +1438,24 @@ def test_downsample_across_dst_weekly(): tm.assert_series_equal(result, expected) +def test_downsample_dst_at_midnight(): + # GH 25758 + start = datetime(2018, 11, 3, 12) + end = datetime(2018, 11, 5, 12) + index = pd.date_range(start, end, freq="1H") + index = index.tz_localize("UTC").tz_convert("America/Havana") + data = list(range(len(index))) + dataframe = pd.DataFrame(data, index=index) + result = dataframe.groupby(pd.Grouper(freq="1D")).mean() + expected = DataFrame( + [7.5, 28.0, 44.5], + index=date_range("2018-11-03", periods=3).tz_localize( + "America/Havana", ambiguous=True + ), + ) + tm.assert_frame_equal(result, expected) + + def test_resample_with_nat(): # GH 13020 
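To illustrate the updated expectation in `test_resample_integerarray` above (the nullable `Int64` dtype surviving a resampled `mean` instead of decaying to `float64`), a minimal sketch:

```python
import pandas as pd

ts = pd.Series(
    range(9),
    index=pd.date_range("2000-01-01", periods=9, freq="T"),
    dtype="Int64",
)
result = ts.resample("3T").mean()
print(result)        # values 1, 4, 7
print(result.dtype)  # Int64 per the updated expectation above, not float64
```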
index = DatetimeIndex( diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index d552241f9126f..5044a18e33248 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -287,7 +287,7 @@ def test_agg_consistency(): r = df.resample("3T") - msg = "nested renamer is not supported" + msg = r"Column\(s\) \['r1', 'r2'\] do not exist" with pytest.raises(pd.core.base.SpecificationError, match=msg): r.agg({"r1": "mean", "r2": "sum"}) @@ -419,7 +419,7 @@ def test_agg_misc(): [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] ) - msg = "nested renamer is not supported" + msg = r"Column\(s\) \['result1', 'result2'\] do not exist" for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): t[["A", "B"]].agg(OrderedDict([("result1", np.sum), ("result2", np.mean)])) @@ -440,6 +440,8 @@ def test_agg_misc(): result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) tm.assert_frame_equal(result, expected, check_like=True) + msg = "nested renamer is not supported" + # series like aggs for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): @@ -580,3 +582,27 @@ def test_agg_with_datetime_index_list_agg_func(col_name): columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), ) tm.assert_frame_equal(result, expected) + + +def test_resample_agg_readonly(): + # GH#31710 cython needs to allow readonly data + index = pd.date_range("2020-01-01", "2020-01-02", freq="1h") + arr = np.zeros_like(index) + arr.setflags(write=False) + + ser = pd.Series(arr, index=index) + rs = ser.resample("1D") + + expected = pd.Series([pd.Timestamp(0), pd.Timestamp(0)], index=index[::24]) + + result = rs.agg("last") + tm.assert_series_equal(result, expected) + + result = rs.agg("first") + tm.assert_series_equal(result, expected) + + result = rs.agg("max") + tm.assert_series_equal(result, expected) + + result = rs.agg("min") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 725157b7c8523..dc1efa46403be 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -298,7 +298,7 @@ def test_join_on_inner(self): expected = df.join(df2, on="key") expected = expected[expected["value"].notna()] - tm.assert_series_equal(joined["key"], expected["key"], check_dtype=False) + tm.assert_series_equal(joined["key"], expected["key"]) tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4f2cd878df613..a92e628960456 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index) - df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]}) + df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index) + df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on="key", right_index=True, how=how) expected = pd.DataFrame( [ - [1.0, 0, 1], - [2.0, 2, 3], - [3.0, 2, 3], - [np.nan, 1, 2], - [np.nan, 3, 4], - 
[np.nan, 4, 5], + [0, 0, 0], + [1, 1, 1], + [2, 2, 2], + [np.nan, 3, 3], + [np.nan, 4, 4], + [np.nan, 5, 5], ], columns=["a", "key", "b"], ) @@ -1318,6 +1318,20 @@ def test_merge_right_index_right(self): result = left.merge(right, left_on="key", right_index=True, how="right") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("how", ["left", "right"]) + def test_merge_preserves_row_order(self, how): + # GH 27453 + left_df = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) + right_df = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]}) + result = left_df.merge(right_df, on=["animal", "max_speed"], how=how) + if how == "right": + expected = pd.DataFrame( + {"animal": ["quetzal", "pig"], "max_speed": [80, 11]} + ) + else: + expected = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) + tm.assert_frame_equal(result, expected) + def test_merge_take_missing_values_from_index_of_other_dtype(self): # GH 24212 left = pd.DataFrame( @@ -1350,7 +1364,7 @@ def test_merge_readonly(self): np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] ) - data1._data.blocks[0].values.flags.writeable = False + data1._mgr.blocks[0].values.flags.writeable = False data1.merge(data2) # no error @@ -2010,6 +2024,36 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "how,expected", + [ + ( + "right", + DataFrame( + {"A": [100, 200, 300], "B1": [60, 70, np.nan], "B2": [600, 700, 800]} + ), + ), + ( + "outer", + DataFrame( + { + "A": [100, 200, 1, 300], + "B1": [60, 70, 80, np.nan], + "B2": [600, 700, np.nan, 800], + } + ), + ), + ], +) +def test_merge_duplicate_suffix(how, expected): + left_df = DataFrame({"A": [100, 200, 1], "B": [60, 70, 80]}) + right_df = DataFrame({"A": [100, 200, 300], "B": [600, 700, 800]}) + result = merge(left_df, right_df, on="A", how=how, suffixes=("_x", "_x")) + expected.columns = ["A", "B_x", "B_x"] + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "col1, col2, suffixes", [ @@ -2077,8 +2121,7 @@ def test_merge_equal_cat_dtypes(cat_dtype, reverse): } ).set_index("foo") - # Categorical is unordered, so don't check ordering. - tm.assert_frame_equal(result, expected, check_categorical=False) + tm.assert_frame_equal(result, expected) def test_merge_equal_cat_dtypes2(): @@ -2100,8 +2143,7 @@ def test_merge_equal_cat_dtypes2(): {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)} ).set_index("foo") - # Categorical is unordered, so don't check ordering. - tm.assert_frame_equal(result, expected, check_categorical=False) + tm.assert_frame_equal(result, expected) def test_merge_on_cat_and_ext_array(): @@ -2163,3 +2205,25 @@ def test_merge_datetime_upcast_dtype(): } ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("n_categories", [5, 128]) +def test_categorical_non_unique_monotonic(n_categories): + # GH 28189 + # With n_categories as 5, we test the int8 case is hit in libjoin, + # with n_categories as 128 we test the int16 case. 
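For context, the new `test_merge_preserves_row_order` above pins down row ordering for left and right merges (GH 27453); a minimal sketch:

```python
import pandas as pd

left = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
right = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]})

# how="left" keeps the left frame's row order; how="right" keeps the right frame's.
print(left.merge(right, on=["animal", "max_speed"], how="left"))
print(left.merge(right, on=["animal", "max_speed"], how="right"))
```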
+ left_index = CategoricalIndex([0] + list(range(n_categories))) + df1 = DataFrame(range(n_categories + 1), columns=["value"], index=left_index) + df2 = DataFrame( + [[6]], + columns=["value"], + index=CategoricalIndex([0], categories=np.arange(n_categories)), + ) + + result = merge(df1, df2, how="left", left_index=True, right_index=True) + expected = DataFrame( + [[i, 6.0] if i < 2 else [i, np.nan] for i in range(n_categories + 1)], + columns=["value_x", "value_y"], + index=left_index, + ) + tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index afd8f4178f741..bccae2c4c2772 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1113,28 +1113,28 @@ def test_concat_copy(self): # These are actual copies. result = concat([df, df2, df3], axis=1, copy=True) - for b in result._data.blocks: + for b in result._mgr.blocks: assert b.values.base is None # These are the same. result = concat([df, df2, df3], axis=1, copy=False) - for b in result._data.blocks: + for b in result._mgr.blocks: if b.is_float: - assert b.values.base is df._data.blocks[0].values.base + assert b.values.base is df._mgr.blocks[0].values.base elif b.is_integer: - assert b.values.base is df2._data.blocks[0].values.base + assert b.values.base is df2._mgr.blocks[0].values.base elif b.is_object: assert b.values.base is not None # Float block was consolidated. df4 = DataFrame(np.random.randn(4, 1)) result = concat([df, df2, df3, df4], axis=1, copy=False) - for b in result._data.blocks: + for b in result._mgr.blocks: if b.is_float: assert b.values.base is None elif b.is_integer: - assert b.values.base is df2._data.blocks[0].values.base + assert b.values.base is df2._mgr.blocks[0].values.base elif b.is_object: assert b.values.base is not None @@ -1220,13 +1220,17 @@ def test_concat_series_partial_columns_names(self): expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) tm.assert_frame_equal(result, expected) - def test_concat_dict(self): - frames = { - "foo": DataFrame(np.random.randn(4, 3)), - "bar": DataFrame(np.random.randn(4, 3)), - "baz": DataFrame(np.random.randn(4, 3)), - "qux": DataFrame(np.random.randn(4, 3)), - } + @pytest.mark.parametrize("mapping", ["mapping", "dict"]) + def test_concat_mapping(self, mapping, non_dict_mapping_subclass): + constructor = dict if mapping == "dict" else non_dict_mapping_subclass + frames = constructor( + { + "foo": DataFrame(np.random.randn(4, 3)), + "bar": DataFrame(np.random.randn(4, 3)), + "baz": DataFrame(np.random.randn(4, 3)), + "qux": DataFrame(np.random.randn(4, 3)), + } + ) sorted_keys = list(frames.keys()) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py new file mode 100644 index 0000000000000..8795af2e11122 --- /dev/null +++ b/pandas/tests/reshape/test_crosstab.py @@ -0,0 +1,700 @@ +import numpy as np +import pytest + +from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, Series, crosstab +import pandas._testing as tm + + +class TestCrosstab: + def setup_method(self, method): + df = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": 
np.random.randn(11), + } + ) + + self.df = df.append(df, ignore_index=True) + + def test_crosstab_single(self): + df = self.df + result = crosstab(df["A"], df["C"]) + expected = df.groupby(["A", "C"]).size().unstack() + tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) + + def test_crosstab_multiple(self): + df = self.df + + result = crosstab(df["A"], [df["B"], df["C"]]) + expected = df.groupby(["A", "B", "C"]).size() + expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + result = crosstab([df["B"], df["C"]], df["A"]) + expected = df.groupby(["B", "C", "A"]).size() + expected = expected.unstack("A").fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + def test_crosstab_ndarray(self): + a = np.random.randint(0, 5, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 10, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c")) + expected = crosstab(df["a"], [df["b"], df["c"]]) + tm.assert_frame_equal(result, expected) + + result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c")) + expected = crosstab([df["b"], df["c"]], df["a"]) + tm.assert_frame_equal(result, expected) + + # assign arbitrary names + result = crosstab(self.df["A"].values, self.df["C"].values) + assert result.index.name == "row_0" + assert result.columns.name == "col_0" + + def test_crosstab_non_aligned(self): + # GH 17005 + a = Series([0, 1, 1], index=["a", "b", "c"]) + b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"]) + c = np.array([3, 4, 3]) + + expected = DataFrame( + [[1, 0], [1, 1]], + index=Index([0, 1], name="row_0"), + columns=Index([3, 4], name="col_0"), + ) + + result = crosstab(a, b) + tm.assert_frame_equal(result, expected) + + result = crosstab(a, c) + tm.assert_frame_equal(result, expected) + + def test_crosstab_margins(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) + + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] + + all_cols = result["All", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") + # to keep index.name + exp_margin = Series([len(df)], index=Index(["All"], name="a")) + exp_cols = exp_cols.append(exp_margin) + exp_cols.name = ("All", "") + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc["All"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")])) + exp_rows.name = "All" + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + def test_crosstab_margins_set_margin_name(self): + # GH 15972 + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({"a": a, "b": b, "c": c}) + + result = crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name="TOTAL", + ) + + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] + + all_cols = result["TOTAL", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") + # to keep index.name + exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a")) + exp_cols = exp_cols.append(exp_margin) + 
exp_cols.name = ("TOTAL", "") + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc["TOTAL"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("TOTAL", "")])) + exp_rows.name = "TOTAL" + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + msg = "margins_name argument must be a string" + for margins_name in [666, None, ["a", "b"]]: + with pytest.raises(ValueError, match=msg): + crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name=margins_name, + ) + + def test_crosstab_pass_values(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + values = np.random.randn(100) + + table = crosstab( + [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] + ) + + df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) + + expected = df.pivot_table( + "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum + ) + tm.assert_frame_equal(table, expected) + + def test_crosstab_dropna(self): + # GH 3820 + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False) + m = MultiIndex.from_tuples( + [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")], + names=["b", "c"], + ) + tm.assert_index_equal(res.columns, m) + + def test_crosstab_no_overlap(self): + # GS 10291 + + s1 = Series([1, 2, 3], index=[1, 2, 3]) + s2 = Series([4, 5, 6], index=[4, 5, 6]) + + actual = crosstab(s1, s2) + expected = DataFrame() + + tm.assert_frame_equal(actual, expected) + + def test_margin_dropna(self): + # GH 12577 + # pivot_table counts null into margin ('All') + # when margins=true and dropna=true + + df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) + actual = crosstab(df.a, df.b, margins=True, dropna=True) + expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) + actual = crosstab(df.a, df.b, margins=True, dropna=True) + expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]} + ) + actual = crosstab(df.a, df.b, margins=True, dropna=True) + expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + # GH 12642 + # _add_margins raises KeyError: Level None not found + # when margins=True and dropna=False + df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) + actual = crosstab(df.a, df.b, margins=True, dropna=False) + expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 
4, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) + actual = crosstab(df.a, df.b, margins=True, dropna=False) + expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") + tm.assert_frame_equal(actual, expected) + + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + + actual = crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [ + ["one", "one", "two", "two", "All"], + ["dull", "shiny", "dull", "shiny", ""], + ], + names=["b", "c"], + ) + expected = DataFrame( + [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m + ) + expected.index = Index(["bar", "foo", "All"], name="a") + tm.assert_frame_equal(actual, expected) + + actual = crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") + tm.assert_frame_equal(actual, expected) + + actual = crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") + tm.assert_frame_equal(actual, expected) + + def test_crosstab_normalize(self): + # Issue 12578 + df = DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + rindex = Index([1, 2], name="a") + cindex = Index([3, 4], name="b") + full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex) + row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex) + col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex) + + # Check all normalize args + tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal) + tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal) + tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal) + tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize=1), + crosstab(df.a, df.b, normalize="columns"), + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index"), + ) + + row_normal_margins = DataFrame( + [[1.0, 0], [0.25, 0.75], [0.4, 0.6]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4], name="b", dtype="object"), + ) + col_normal_margins = DataFrame( + [[0.5, 0, 0.2], [0.5, 1.0, 0.8]], + index=Index([1, 2], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b", dtype="object"), + ) + + all_normal_margins = DataFrame( + [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4, 
"All"], name="b", dtype="object"), + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins, + ) + tm.assert_frame_equal( + crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins + ) + + # Test arrays + crosstab( + [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2]) + ) + + # Test with aggfunc + norm_counts = DataFrame( + [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b"), + ) + test_case = crosstab( + df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True + ) + tm.assert_frame_equal(test_case, norm_counts) + + df = DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]} + ) + + norm_sum = DataFrame( + [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]], + index=Index([1, 2, "All"], name="a", dtype="object"), + columns=Index([3, 4, "All"], name="b", dtype="object"), + ) + test_case = crosstab( + df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True + ) + tm.assert_frame_equal(test_case, norm_sum) + + def test_crosstab_with_empties(self): + # Check handling of empties + df = DataFrame( + { + "a": [1, 2, 2, 2, 2], + "b": [3, 3, 4, 4, 4], + "c": [np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + + empty = DataFrame( + [[0.0, 0.0], [0.0, 0.0]], + index=Index([1, 2], name="a", dtype="int64"), + columns=Index([3, 4], name="b"), + ) + + for i in [True, "index", "columns"]: + calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i) + tm.assert_frame_equal(empty, calculated) + + nans = DataFrame( + [[0.0, np.nan], [0.0, 0.0]], + index=Index([1, 2], name="a", dtype="int64"), + columns=Index([3, 4], name="b"), + ) + + calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False) + tm.assert_frame_equal(nans, calculated) + + def test_crosstab_errors(self): + # Issue 12578 + + df = DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + error = "values cannot be used without an aggfunc." 
+ with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, values=df.c) + + error = "aggfunc cannot be used without values" + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, aggfunc=np.mean) + + error = "Not a valid normalize argument" + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, normalize="42") + + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, normalize=42) + + error = "Not a valid margins argument" + with pytest.raises(ValueError, match=error): + crosstab(df.a, df.b, normalize="all", margins=42) + + def test_crosstab_with_categorial_columns(self): + # GH 8860 + df = DataFrame( + { + "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"], + "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"], + } + ) + categories = ["Sedan", "Electric", "Pickup"] + df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories) + result = crosstab(df["MAKE"], df["MODEL"]) + + expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE") + expected_columns = CategoricalIndex( + categories, categories=categories, ordered=False, name="MODEL" + ) + expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] + expected = DataFrame( + expected_data, index=expected_index, columns=expected_columns + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_with_numpy_size(self): + # GH 4003 + df = DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": np.random.randn(24), + "E": np.random.randn(24), + } + ) + result = crosstab( + index=[df["A"], df["B"]], + columns=[df["C"]], + margins=True, + aggfunc=np.size, + values=df["D"], + ) + expected_index = MultiIndex( + levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]], + codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], + names=["A", "B"], + ) + expected_column = Index(["bar", "foo", "All"], dtype="object", name="C") + expected_data = np.array( + [ + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [12.0, 12.0, 24.0], + ] + ) + expected = DataFrame( + expected_data, index=expected_index, columns=expected_column + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_dup_index_names(self): + # GH 13279 + s = Series(range(3), name="foo") + + result = crosstab(s, s) + expected_index = Index(range(3), name="foo") + expected = DataFrame( + np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) + def test_crosstab_tuple_name(self, names): + s1 = Series(range(3), name=names[0]) + s2 = Series(range(1, 4), name=names[1]) + + mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names) + expected = Series(1, index=mi).unstack(1, fill_value=0) + + result = crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + + def test_crosstab_both_tuple_names(self): + # GH 18321 + s1 = Series(range(3), name=("a", "b")) + s2 = Series(range(3), name=("c", "d")) + + expected = DataFrame( + np.eye(3, dtype="int64"), + index=Index(range(3), name=("a", "b")), + columns=Index(range(3), name=("c", "d")), + ) + result = crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + + def test_crosstab_unsorted_order(self): + df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 
6]}, index=["C", "A", "B"]) + result = crosstab(df.index, [df.b, df.a]) + e_idx = Index(["A", "B", "C"], name="row_0") + e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"]) + expected = DataFrame( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns + ) + tm.assert_frame_equal(result, expected) + + def test_crosstab_normalize_multiple_columns(self): + # GH 15150 + df = DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": [0] * 24, + "E": [0] * 24, + } + ) + result = crosstab( + [df.A, df.B], + df.C, + values=df.D, + aggfunc=np.sum, + normalize=True, + margins=True, + ) + expected = DataFrame( + np.array([0] * 29 + [1], dtype=float).reshape(10, 3), + columns=Index(["bar", "foo", "All"], dtype="object", name="C"), + index=MultiIndex.from_tuples( + [ + ("one", "A"), + ("one", "B"), + ("one", "C"), + ("three", "A"), + ("three", "B"), + ("three", "C"), + ("two", "A"), + ("two", "B"), + ("two", "C"), + ("All", ""), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) + + def test_margin_normalize(self): + # GH 27500 + df = DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + # normalize on index + result = crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 + ) + expected = DataFrame( + [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.columns = Index(["large", "small"], dtype="object", name="C") + tm.assert_frame_equal(result, expected) + + # normalize on columns + result = crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 + ) + expected = DataFrame( + [ + [0.25, 0.2, 0.222222], + [0.25, 0.2, 0.222222], + [0.5, 0.2, 0.333333], + [0, 0.4, 0.222222], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["bar", "foo"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) + + # normalize on both index and column + result = crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True + ) + expected = DataFrame( + [ + [0.111111, 0.111111, 0.222222], + [0.111111, 0.111111, 0.222222], + [0.222222, 0.111111, 0.333333], + [0.000000, 0.222222, 0.222222], + [0.444444, 0.555555, 1], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_get_dummies.py similarity index 84% rename from pandas/tests/reshape/test_reshape.py rename to pandas/tests/reshape/test_get_dummies.py index 6113cfec48df9..c003bfa6a239a 100644 --- a/pandas/tests/reshape/test_reshape.py +++ 
b/pandas/tests/reshape/test_get_dummies.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.common import is_integer_dtype import pandas as pd -from pandas import Categorical, DataFrame, Index, Series, get_dummies +from pandas import Categorical, CategoricalIndex, DataFrame, Series, get_dummies import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray, SparseDtype @@ -31,11 +31,11 @@ def effective_dtype(self, dtype): return np.uint8 return dtype - def test_raises_on_dtype_object(self, df): + def test_get_dummies_raises_on_dtype_object(self, df): with pytest.raises(ValueError): get_dummies(df, dtype="object") - def test_basic(self, sparse, dtype): + def test_get_dummies_basic(self, sparse, dtype): s_list = list("abc") s_series = Series(s_list) s_series_index = Series(s_list, list("ABC")) @@ -56,7 +56,7 @@ def test_basic(self, sparse, dtype): result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) tm.assert_frame_equal(result, expected) - def test_basic_types(self, sparse, dtype): + def test_get_dummies_basic_types(self, sparse, dtype): # GH 10531 s_list = list("abc") s_series = Series(s_list) @@ -106,7 +106,7 @@ def test_basic_types(self, sparse, dtype): result = result.sort_index() tm.assert_series_equal(result, expected) - def test_just_na(self, sparse): + def test_get_dummies_just_na(self, sparse): just_na_list = [np.nan] just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index=["A"]) @@ -123,7 +123,7 @@ def test_just_na(self, sparse): assert res_series.index.tolist() == [0] assert res_series_index.index.tolist() == ["A"] - def test_include_na(self, sparse, dtype): + def test_get_dummies_include_na(self, sparse, dtype): s = ["a", "b", np.nan] res = get_dummies(s, sparse=sparse, dtype=dtype) exp = DataFrame( @@ -152,7 +152,7 @@ def test_include_na(self, sparse, dtype): ) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) - def test_unicode(self, sparse): + def test_get_dummies_unicode(self, sparse): # See GH 6885 - get_dummies chokes on unicode values import unicodedata @@ -175,7 +175,7 @@ def test_dataframe_dummies_all_obj(self, df, sparse): dtype=np.uint8, ) if sparse: - expected = pd.DataFrame( + expected = DataFrame( { "A_a": SparseArray([1, 0, 1], dtype="uint8"), "A_b": SparseArray([0, 1, 0], dtype="uint8"), @@ -223,7 +223,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected = expected[["C"] + cols] - typ = SparseArray if sparse else pd.Series + typ = SparseArray if sparse else Series expected[cols] = expected[cols].apply(lambda x: typ(x)) tm.assert_frame_equal(result, expected) @@ -242,11 +242,11 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): # https://github.com/pandas-dev/pandas/issues/14427 expected = pd.concat( [ - pd.Series([1, 2, 3], name="C"), - pd.Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"), - pd.Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"), - pd.Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"), - pd.Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"), + Series([1, 2, 3], name="C"), + Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"), + Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"), ], axis=1, ) @@ -267,7 +267,7 @@ def test_dataframe_dummies_subset(self, df, sparse): expected[["C"]] = df[["C"]] if sparse: cols = ["from_A_a", "from_A_b"] - 
expected[cols] = expected[cols].astype(pd.SparseDtype("uint8", 0)) + expected[cols] = expected[cols].astype(SparseDtype("uint8", 0)) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): @@ -286,7 +286,7 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse): expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]] if sparse: cols = ["A..a", "A..b", "B..b", "B..c"] - expected[cols] = expected[cols].astype(pd.SparseDtype("uint8", 0)) + expected[cols] = expected[cols].astype(SparseDtype("uint8", 0)) tm.assert_frame_equal(result, expected) @@ -323,7 +323,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse): columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected[columns] = expected[columns].astype(np.uint8) if sparse: - expected[columns] = expected[columns].astype(pd.SparseDtype("uint8", 0)) + expected[columns] = expected[columns].astype(SparseDtype("uint8", 0)) tm.assert_frame_equal(result, expected) @@ -359,7 +359,7 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype): tm.assert_frame_equal(result, expected) def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): - df["cat"] = pd.Categorical(["x", "y", "y"]) + df["cat"] = Categorical(["x", "y", "y"]) result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1) if sparse: arr = SparseArray @@ -386,30 +386,30 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): "get_dummies_kwargs,expected", [ ( - {"data": pd.DataFrame(({"ä": ["a"]}))}, - pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + {"data": DataFrame(({"ä": ["a"]}))}, + DataFrame({"ä_a": [1]}, dtype=np.uint8), ), ( - {"data": pd.DataFrame({"x": ["ä"]})}, - pd.DataFrame({"x_ä": [1]}, dtype=np.uint8), + {"data": DataFrame({"x": ["ä"]})}, + DataFrame({"x_ä": [1]}, dtype=np.uint8), ), ( - {"data": pd.DataFrame({"x": ["a"]}), "prefix": "ä"}, - pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + {"data": DataFrame({"x": ["a"]}), "prefix": "ä"}, + DataFrame({"ä_a": [1]}, dtype=np.uint8), ), ( - {"data": pd.DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, - pd.DataFrame({"xäa": [1]}, dtype=np.uint8), + {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, + DataFrame({"xäa": [1]}, dtype=np.uint8), ), ], ) def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): - # GH22084 pd.get_dummies incorrectly encodes unicode characters + # GH22084 get_dummies incorrectly encodes unicode characters # in dataframe column names result = get_dummies(**get_dummies_kwargs) tm.assert_frame_equal(result, expected) - def test_basic_drop_first(self, sparse): + def test_get_dummies_basic_drop_first(self, sparse): # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case s_list = list("abc") @@ -430,7 +430,7 @@ def test_basic_drop_first(self, sparse): result = get_dummies(s_series_index, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) - def test_basic_drop_first_one_level(self, sparse): + def test_get_dummies_basic_drop_first_one_level(self, sparse): # Test the case that categorical variable only has one level. 
s_list = list("aaa") s_series = Series(s_list) @@ -448,7 +448,7 @@ def test_basic_drop_first_one_level(self, sparse): result = get_dummies(s_series_index, drop_first=True, sparse=sparse) tm.assert_frame_equal(result, expected) - def test_basic_drop_first_NA(self, sparse): + def test_get_dummies_basic_drop_first_NA(self, sparse): # Test NA handling together with drop_first s_NA = ["a", "b", np.nan] res = get_dummies(s_NA, drop_first=True, sparse=sparse) @@ -481,7 +481,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse): tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): - df["cat"] = pd.Categorical(["x", "y", "y"]) + df["cat"] = Categorical(["x", "y", "y"]) result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame( {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]} @@ -521,24 +521,24 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): expected = expected[["C", "A_b", "B_c"]] tm.assert_frame_equal(result, expected) - def test_int_int(self): + def test_get_dummies_int_int(self): data = Series([1, 2, 1]) - result = pd.get_dummies(data) + result = get_dummies(data) expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8) tm.assert_frame_equal(result, expected) - data = Series(pd.Categorical(["a", "b", "a"])) - result = pd.get_dummies(data) + data = Series(Categorical(["a", "b", "a"])) + result = get_dummies(data) expected = DataFrame( - [[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(["a", "b"]), dtype=np.uint8 + [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8 ) tm.assert_frame_equal(result, expected) - def test_int_df(self, dtype): + def test_get_dummies_int_df(self, dtype): data = DataFrame( { "A": [1, 2, 1], - "B": pd.Categorical(["a", "b", "a"]), + "B": Categorical(["a", "b", "a"]), "C": [1, 2, 1], "D": [1.0, 2.0, 1.0], } @@ -549,22 +549,22 @@ def test_int_df(self, dtype): columns=columns, ) expected[columns[2:]] = expected[columns[2:]].astype(dtype) - result = pd.get_dummies(data, columns=["A", "B"], dtype=dtype) + result = get_dummies(data, columns=["A", "B"], dtype=dtype) tm.assert_frame_equal(result, expected) - def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): + @pytest.mark.parametrize("ordered", [True, False]) + def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered): # GH13854 - for ordered in [False, True]: - cat = pd.Categorical(list("xy"), categories=list("xyz"), ordered=ordered) - result = get_dummies(cat, dtype=dtype) + cat = Categorical(list("xy"), categories=list("xyz"), ordered=ordered) + result = get_dummies(cat, dtype=dtype) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype)) - cols = pd.CategoricalIndex( - cat.categories, categories=cat.categories, ordered=ordered - ) - expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype)) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype)) + cols = CategoricalIndex( + cat.categories, categories=cat.categories, ordered=ordered + ) + expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype)) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("sparse", [True, False]) def test_get_dummies_dont_sparsify_all_columns(self, sparse): @@ -593,10 +593,10 @@ def test_get_dummies_duplicate_columns(self, df): tm.assert_frame_equal(result, expected) def test_get_dummies_all_sparse(self): 
- df = pd.DataFrame({"A": [1, 2]}) - result = pd.get_dummies(df, columns=["A"], sparse=True) + df = DataFrame({"A": [1, 2]}) + result = get_dummies(df, columns=["A"], sparse=True) dtype = SparseDtype("uint8", 0) - expected = pd.DataFrame( + expected = DataFrame( { "A_1": SparseArray([1, 0], dtype=dtype), "A_2": SparseArray([0, 1], dtype=dtype), @@ -607,7 +607,7 @@ def test_get_dummies_all_sparse(self): @pytest.mark.parametrize("values", ["baz"]) def test_get_dummies_with_string_values(self, values): # issue #28383 - df = pd.DataFrame( + df = DataFrame( { "bar": [1, 2, 3, 4, 5, 6], "foo": ["one", "one", "one", "two", "two", "two"], @@ -619,26 +619,4 @@ def test_get_dummies_with_string_values(self, values): msg = "Input must be a list-like for parameter `columns`" with pytest.raises(TypeError, match=msg): - pd.get_dummies(df, columns=values) - - -class TestCategoricalReshape: - def test_reshaping_multi_index_categorical(self): - - cols = ["ItemA", "ItemB", "ItemC"] - data = {c: tm.makeTimeDataFrame() for c in cols} - df = pd.concat({c: data[c].stack() for c in data}, axis="columns") - df.index.names = ["major", "minor"] - df["str"] = "foo" - - df["category"] = df["str"].astype("category") - result = df["category"].unstack() - - dti = df.index.levels[0] - c = Categorical(["foo"] * len(dti)) - expected = DataFrame( - {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, - columns=Index(list("ABCD"), name="minor"), - index=dti.rename("major"), - ) - tm.assert_frame_equal(result, expected) + get_dummies(df, columns=values) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e09a2a7907177..e49b80e476003 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -17,7 +17,7 @@ ) import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT -from pandas.core.reshape.pivot import crosstab, pivot_table +from pandas.core.reshape.pivot import pivot_table @pytest.fixture(params=[True, False]) @@ -1026,6 +1026,14 @@ def test_pivot_table_multiindex_only(self, cols): tm.assert_frame_equal(result, expected) + def test_pivot_table_retains_tz(self): + dti = date_range("2016-01-01", periods=3, tz="Europe/Amsterdam") + df = DataFrame({"A": np.random.randn(3), "B": np.random.randn(3), "C": dti}) + result = df.pivot_table(index=["B", "C"], dropna=False) + + # check tz retention + assert result.index.levels[1].equals(dti) + def test_pivot_integer_columns(self): # caused by upstream bug in unstack @@ -1748,18 +1756,14 @@ def test_margins_casted_to_float(self, observed): ) tm.assert_frame_equal(result, expected) - def test_pivot_with_categorical(self, observed, ordered_fixture): + def test_pivot_with_categorical(self, observed, ordered): # gh-21370 idx = [np.nan, "low", "high", "low", np.nan] col = [np.nan, "A", "B", np.nan, "A"] df = pd.DataFrame( { - "In": pd.Categorical( - idx, categories=["low", "high"], ordered=ordered_fixture - ), - "Col": pd.Categorical( - col, categories=["A", "B"], ordered=ordered_fixture - ), + "In": pd.Categorical(idx, categories=["low", "high"], ordered=ordered), + "Col": pd.Categorical(col, categories=["A", "B"], ordered=ordered), "Val": range(1, 6), } ) @@ -1768,16 +1772,14 @@ def test_pivot_with_categorical(self, observed, ordered_fixture): index="In", columns="Col", values="Val", observed=observed ) - expected_cols = pd.CategoricalIndex( - ["A", "B"], ordered=ordered_fixture, name="Col" - ) + expected_cols = pd.CategoricalIndex(["A", "B"], ordered=ordered, name="Col") expected = 
pd.DataFrame( data=[[2.0, np.nan], [np.nan, 3.0]], columns=expected_cols ) expected.index = Index( pd.Categorical( - ["low", "high"], categories=["low", "high"], ordered=ordered_fixture + ["low", "high"], categories=["low", "high"], ordered=ordered ), name="In", ) @@ -2064,708 +2066,3 @@ def agg(l): ) with pytest.raises(KeyError, match="notpresent"): foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) - - -class TestCrosstab: - def setup_method(self, method): - df = DataFrame( - { - "A": [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - "B": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - "C": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - "D": np.random.randn(11), - "E": np.random.randn(11), - "F": np.random.randn(11), - } - ) - - self.df = df.append(df, ignore_index=True) - - def test_crosstab_single(self): - df = self.df - result = crosstab(df["A"], df["C"]) - expected = df.groupby(["A", "C"]).size().unstack() - tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) - - def test_crosstab_multiple(self): - df = self.df - - result = crosstab(df["A"], [df["B"], df["C"]]) - expected = df.groupby(["A", "B", "C"]).size() - expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64) - tm.assert_frame_equal(result, expected) - - result = crosstab([df["B"], df["C"]], df["A"]) - expected = df.groupby(["B", "C", "A"]).size() - expected = expected.unstack("A").fillna(0).astype(np.int64) - tm.assert_frame_equal(result, expected) - - def test_crosstab_ndarray(self): - a = np.random.randint(0, 5, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 10, size=100) - - df = DataFrame({"a": a, "b": b, "c": c}) - - result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c")) - expected = crosstab(df["a"], [df["b"], df["c"]]) - tm.assert_frame_equal(result, expected) - - result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c")) - expected = crosstab([df["b"], df["c"]], df["a"]) - tm.assert_frame_equal(result, expected) - - # assign arbitrary names - result = crosstab(self.df["A"].values, self.df["C"].values) - assert result.index.name == "row_0" - assert result.columns.name == "col_0" - - def test_crosstab_non_aligned(self): - # GH 17005 - a = pd.Series([0, 1, 1], index=["a", "b", "c"]) - b = pd.Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"]) - c = np.array([3, 4, 3]) - - expected = pd.DataFrame( - [[1, 0], [1, 1]], - index=Index([0, 1], name="row_0"), - columns=Index([3, 4], name="col_0"), - ) - - result = crosstab(a, b) - tm.assert_frame_equal(result, expected) - - result = crosstab(a, c) - tm.assert_frame_equal(result, expected) - - def test_crosstab_margins(self): - a = np.random.randint(0, 7, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 5, size=100) - - df = DataFrame({"a": a, "b": b, "c": c}) - - result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) - - assert result.index.names == ("a",) - assert result.columns.names == ["b", "c"] - - all_cols = result["All", ""] - exp_cols = df.groupby(["a"]).size().astype("i8") - # to keep index.name - exp_margin = Series([len(df)], index=Index(["All"], name="a")) - exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ("All", "") - - tm.assert_series_equal(all_cols, exp_cols) - - all_rows = result.loc["All"] - exp_rows = df.groupby(["b", 
"c"]).size().astype("i8") - exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")])) - exp_rows.name = "All" - - exp_rows = exp_rows.reindex(all_rows.index) - exp_rows = exp_rows.fillna(0).astype(np.int64) - tm.assert_series_equal(all_rows, exp_rows) - - def test_crosstab_margins_set_margin_name(self): - # GH 15972 - a = np.random.randint(0, 7, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 5, size=100) - - df = DataFrame({"a": a, "b": b, "c": c}) - - result = crosstab( - a, - [b, c], - rownames=["a"], - colnames=("b", "c"), - margins=True, - margins_name="TOTAL", - ) - - assert result.index.names == ("a",) - assert result.columns.names == ["b", "c"] - - all_cols = result["TOTAL", ""] - exp_cols = df.groupby(["a"]).size().astype("i8") - # to keep index.name - exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a")) - exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ("TOTAL", "") - - tm.assert_series_equal(all_cols, exp_cols) - - all_rows = result.loc["TOTAL"] - exp_rows = df.groupby(["b", "c"]).size().astype("i8") - exp_rows = exp_rows.append(Series([len(df)], index=[("TOTAL", "")])) - exp_rows.name = "TOTAL" - - exp_rows = exp_rows.reindex(all_rows.index) - exp_rows = exp_rows.fillna(0).astype(np.int64) - tm.assert_series_equal(all_rows, exp_rows) - - msg = "margins_name argument must be a string" - for margins_name in [666, None, ["a", "b"]]: - with pytest.raises(ValueError, match=msg): - crosstab( - a, - [b, c], - rownames=["a"], - colnames=("b", "c"), - margins=True, - margins_name=margins_name, - ) - - def test_crosstab_pass_values(self): - a = np.random.randint(0, 7, size=100) - b = np.random.randint(0, 3, size=100) - c = np.random.randint(0, 5, size=100) - values = np.random.randn(100) - - table = crosstab( - [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] - ) - - df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) - - expected = df.pivot_table( - "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum - ) - tm.assert_frame_equal(table, expected) - - def test_crosstab_dropna(self): - # GH 3820 - a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) - b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object) - c = np.array( - ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object - ) - res = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False) - m = MultiIndex.from_tuples( - [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")], - names=["b", "c"], - ) - tm.assert_index_equal(res.columns, m) - - def test_crosstab_no_overlap(self): - # GS 10291 - - s1 = pd.Series([1, 2, 3], index=[1, 2, 3]) - s2 = pd.Series([4, 5, 6], index=[4, 5, 6]) - - actual = crosstab(s1, s2) - expected = pd.DataFrame() - - tm.assert_frame_equal(actual, expected) - - def test_margin_dropna(self): - # GH 12577 - # pivot_table counts null into margin ('All') - # when margins=true and dropna=true - - df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) - expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3, 4, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - df = DataFrame( - {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} - ) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) - expected = 
pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3.0, 4.0, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - df = DataFrame( - {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]} - ) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) - expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3, 4, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - # GH 12642 - # _add_margins raises KeyError: Level None not found - # when margins=True and dropna=False - df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) - expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3, 4, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - df = DataFrame( - {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} - ) - actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) - expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) - expected.index = Index([1.0, 2.0, "All"], name="a") - expected.columns = Index([3.0, 4.0, "All"], name="b") - tm.assert_frame_equal(actual, expected) - - a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) - b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) - c = np.array( - ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object - ) - - actual = pd.crosstab( - a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False - ) - m = MultiIndex.from_arrays( - [ - ["one", "one", "two", "two", "All"], - ["dull", "shiny", "dull", "shiny", ""], - ], - names=["b", "c"], - ) - expected = DataFrame( - [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m - ) - expected.index = Index(["bar", "foo", "All"], name="a") - tm.assert_frame_equal(actual, expected) - - actual = pd.crosstab( - [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False - ) - m = MultiIndex.from_arrays( - [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], - names=["a", "b"], - ) - expected = DataFrame( - [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m - ) - expected.columns = Index(["dull", "shiny", "All"], name="c") - tm.assert_frame_equal(actual, expected) - - actual = pd.crosstab( - [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True - ) - m = MultiIndex.from_arrays( - [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], - names=["a", "b"], - ) - expected = DataFrame( - [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m - ) - expected.columns = Index(["dull", "shiny", "All"], name="c") - tm.assert_frame_equal(actual, expected) - - def test_crosstab_normalize(self): - # Issue 12578 - df = pd.DataFrame( - {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} - ) - - rindex = pd.Index([1, 2], name="a") - cindex = pd.Index([3, 4], name="b") - full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex) - row_normal = pd.DataFrame( - [[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex - ) - col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex) - - # Check all normalize args - tm.assert_frame_equal(pd.crosstab(df.a, df.b, 
normalize="all"), full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True), full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="index"), row_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="columns"), col_normal) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize=1), - pd.crosstab(df.a, df.b, normalize="columns"), - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize=0), - pd.crosstab(df.a, df.b, normalize="index"), - ) - - row_normal_margins = pd.DataFrame( - [[1.0, 0], [0.25, 0.75], [0.4, 0.6]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4], name="b", dtype="object"), - ) - col_normal_margins = pd.DataFrame( - [[0.5, 0, 0.2], [0.5, 1.0, 0.8]], - index=pd.Index([1, 2], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b", dtype="object"), - ) - - all_normal_margins = pd.DataFrame( - [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b", dtype="object"), - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize="columns", margins=True), - col_normal_margins, - ) - tm.assert_frame_equal( - pd.crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins - ) - - # Test arrays - pd.crosstab( - [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2]) - ) - - # Test with aggfunc - norm_counts = pd.DataFrame( - [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b"), - ) - test_case = pd.crosstab( - df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True - ) - tm.assert_frame_equal(test_case, norm_counts) - - df = pd.DataFrame( - {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]} - ) - - norm_sum = pd.DataFrame( - [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]], - index=pd.Index([1, 2, "All"], name="a", dtype="object"), - columns=pd.Index([3, 4, "All"], name="b", dtype="object"), - ) - test_case = pd.crosstab( - df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True - ) - tm.assert_frame_equal(test_case, norm_sum) - - def test_crosstab_with_empties(self): - # Check handling of empties - df = pd.DataFrame( - { - "a": [1, 2, 2, 2, 2], - "b": [3, 3, 4, 4, 4], - "c": [np.nan, np.nan, np.nan, np.nan, np.nan], - } - ) - - empty = pd.DataFrame( - [[0.0, 0.0], [0.0, 0.0]], - index=pd.Index([1, 2], name="a", dtype="int64"), - columns=pd.Index([3, 4], name="b"), - ) - - for i in [True, "index", "columns"]: - calculated = pd.crosstab( - df.a, df.b, values=df.c, aggfunc="count", normalize=i - ) - tm.assert_frame_equal(empty, calculated) - - nans = pd.DataFrame( - [[0.0, np.nan], [0.0, 0.0]], - index=pd.Index([1, 2], name="a", dtype="int64"), - columns=pd.Index([3, 4], name="b"), - ) - - calculated = pd.crosstab( - df.a, df.b, values=df.c, aggfunc="count", normalize=False - ) - tm.assert_frame_equal(nans, calculated) - - def test_crosstab_errors(self): - # Issue 12578 - - df = pd.DataFrame( - {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} - ) - - error = "values cannot be used without an aggfunc." 
- with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, values=df.c) - - error = "aggfunc cannot be used without values" - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, aggfunc=np.mean) - - error = "Not a valid normalize argument" - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize="42") - - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize=42) - - error = "Not a valid margins argument" - with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize="all", margins=42) - - def test_crosstab_with_categorial_columns(self): - # GH 8860 - df = pd.DataFrame( - { - "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"], - "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"], - } - ) - categories = ["Sedan", "Electric", "Pickup"] - df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories) - result = pd.crosstab(df["MAKE"], df["MODEL"]) - - expected_index = pd.Index(["Acura", "Honda", "Tesla"], name="MAKE") - expected_columns = pd.CategoricalIndex( - categories, categories=categories, ordered=False, name="MODEL" - ) - expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] - expected = pd.DataFrame( - expected_data, index=expected_index, columns=expected_columns - ) - tm.assert_frame_equal(result, expected) - - def test_crosstab_with_numpy_size(self): - # GH 4003 - df = pd.DataFrame( - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), - } - ) - result = pd.crosstab( - index=[df["A"], df["B"]], - columns=[df["C"]], - margins=True, - aggfunc=np.size, - values=df["D"], - ) - expected_index = pd.MultiIndex( - levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]], - codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], - names=["A", "B"], - ) - expected_column = pd.Index(["bar", "foo", "All"], dtype="object", name="C") - expected_data = np.array( - [ - [2.0, 2.0, 4.0], - [2.0, 2.0, 4.0], - [2.0, 2.0, 4.0], - [2.0, np.nan, 2.0], - [np.nan, 2.0, 2.0], - [2.0, np.nan, 2.0], - [np.nan, 2.0, 2.0], - [2.0, np.nan, 2.0], - [np.nan, 2.0, 2.0], - [12.0, 12.0, 24.0], - ] - ) - expected = pd.DataFrame( - expected_data, index=expected_index, columns=expected_column - ) - tm.assert_frame_equal(result, expected) - - def test_crosstab_dup_index_names(self): - # GH 13279 - s = pd.Series(range(3), name="foo") - - result = pd.crosstab(s, s) - expected_index = pd.Index(range(3), name="foo") - expected = pd.DataFrame( - np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) - def test_crosstab_tuple_name(self, names): - s1 = pd.Series(range(3), name=names[0]) - s2 = pd.Series(range(1, 4), name=names[1]) - - mi = pd.MultiIndex.from_arrays([range(3), range(1, 4)], names=names) - expected = pd.Series(1, index=mi).unstack(1, fill_value=0) - - result = pd.crosstab(s1, s2) - tm.assert_frame_equal(result, expected) - - def test_crosstab_both_tuple_names(self): - # GH 18321 - s1 = pd.Series(range(3), name=("a", "b")) - s2 = pd.Series(range(3), name=("c", "d")) - - expected = pd.DataFrame( - np.eye(3, dtype="int64"), - index=pd.Index(range(3), name=("a", "b")), - columns=pd.Index(range(3), name=("c", "d")), - ) - result = crosstab(s1, s2) - tm.assert_frame_equal(result, expected) - 
- def test_crosstab_unsorted_order(self): - df = pd.DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) - result = pd.crosstab(df.index, [df.b, df.a]) - e_idx = pd.Index(["A", "B", "C"], name="row_0") - e_columns = pd.MultiIndex.from_tuples( - [(1, 4), (2, 6), (3, 5)], names=["b", "a"] - ) - expected = pd.DataFrame( - [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns - ) - tm.assert_frame_equal(result, expected) - - def test_crosstab_normalize_multiple_columns(self): - # GH 15150 - df = pd.DataFrame( - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": [0] * 24, - "E": [0] * 24, - } - ) - result = pd.crosstab( - [df.A, df.B], - df.C, - values=df.D, - aggfunc=np.sum, - normalize=True, - margins=True, - ) - expected = pd.DataFrame( - np.array([0] * 29 + [1], dtype=float).reshape(10, 3), - columns=Index(["bar", "foo", "All"], dtype="object", name="C"), - index=MultiIndex.from_tuples( - [ - ("one", "A"), - ("one", "B"), - ("one", "C"), - ("three", "A"), - ("three", "B"), - ("three", "C"), - ("two", "A"), - ("two", "B"), - ("two", "C"), - ("All", ""), - ], - names=["A", "B"], - ), - ) - tm.assert_frame_equal(result, expected) - - def test_margin_normalize(self): - # GH 27500 - df = pd.DataFrame( - { - "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], - "C": [ - "small", - "large", - "large", - "small", - "small", - "large", - "small", - "small", - "large", - ], - "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], - } - ) - # normalize on index - result = pd.crosstab( - [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 - ) - expected = pd.DataFrame( - [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] - ) - expected.index = MultiIndex( - levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], - codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], - names=["A", "B"], - ) - expected.columns = Index(["large", "small"], dtype="object", name="C") - tm.assert_frame_equal(result, expected) - - # normalize on columns - result = pd.crosstab( - [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 - ) - expected = pd.DataFrame( - [ - [0.25, 0.2, 0.222222], - [0.25, 0.2, 0.222222], - [0.5, 0.2, 0.333333], - [0, 0.4, 0.222222], - ] - ) - expected.columns = Index( - ["large", "small", "Sub-Total"], dtype="object", name="C" - ) - expected.index = MultiIndex( - levels=[["bar", "foo"], ["one", "two"]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=["A", "B"], - ) - tm.assert_frame_equal(result, expected) - - # normalize on both index and column - result = pd.crosstab( - [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True - ) - expected = pd.DataFrame( - [ - [0.111111, 0.111111, 0.222222], - [0.111111, 0.111111, 0.222222], - [0.222222, 0.111111, 0.333333], - [0.000000, 0.222222, 0.222222], - [0.444444, 0.555555, 1], - ] - ) - expected.columns = Index( - ["large", "small", "Sub-Total"], dtype="object", name="C" - ) - expected.index = MultiIndex( - levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], - codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], - names=["A", "B"], - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index a503173bd74b1..8918d19e4ba7b 100644 --- 
a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -41,7 +41,7 @@ def test_union_categorical(self): for box in [Categorical, CategoricalIndex, Series]: result = union_categoricals([box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) - tm.assert_categorical_equal(result, expected, check_category_order=True) + tm.assert_categorical_equal(result, expected) # new categories ordered by appearance s = Categorical(["x", "y", "z"]) diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index cd518dda4edbf..9d074b5ade425 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -25,6 +25,22 @@ def test_datetimeindex(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) + def test_tzaware_retained(self): + x = date_range("2000-01-01", periods=2, tz="US/Pacific") + y = np.array([3, 4]) + result1, result2 = cartesian_product([x, y]) + + expected = x.repeat(2) + tm.assert_index_equal(result1, expected) + + def test_tzaware_retained_categorical(self): + x = date_range("2000-01-01", periods=2, tz="US/Pacific").astype("category") + y = np.array([3, 4]) + result1, result2 = cartesian_product([x, y]) + + expected = x.repeat(2) + tm.assert_index_equal(result1, expected) + def test_empty(self): # product of empty factors X = [[], [0, 1], []] diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index b51429d0338e3..b21e98827ca92 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -49,7 +49,8 @@ def test_equal(self): assert Interval(0, 1) != 0 def test_comparison(self): - with pytest.raises(TypeError, match="unorderable types"): + msg = "unorderable types" + with pytest.raises(TypeError, match=msg): Interval(0, 1) < 2 assert Interval(0, 1) < Interval(1, 2) @@ -254,6 +255,12 @@ def test_constructor_errors_tz(self, tz_left, tz_right): # GH 18538 left = Timestamp("2017-01-01", tz=tz_left) right = Timestamp("2017-01-02", tz=tz_right) - error = TypeError if com.any_none(tz_left, tz_right) else ValueError - with pytest.raises(error): + + if com.any_none(tz_left, tz_right): + error = TypeError + msg = "Cannot compare tz-naive and tz-aware timestamps" + else: + error = ValueError + msg = "left and right must have the same time zone" + with pytest.raises(error, match=msg): Interval(left, right) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 436810042186a..b9f637c178d53 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -33,7 +33,8 @@ def test_asfreq_near_zero_weekly(self): def test_to_timestamp_out_of_bounds(self): # GH#19643, used to incorrectly give Timestamp in 1754 per = Period("0001-01-01", freq="B") - with pytest.raises(OutOfBoundsDatetime): + msg = "Out of bounds nanosecond timestamp" + with pytest.raises(OutOfBoundsDatetime, match=msg): per.to_timestamp() def test_asfreq_corner(self): @@ -668,9 +669,10 @@ def test_conv_microsecond(self): assert start.value == per.ordinal * 1000 per2 = Period("2300-01-01", "us") - with pytest.raises(OutOfBoundsDatetime, match="2300-01-01"): + msg = "2300-01-01" + with pytest.raises(OutOfBoundsDatetime, match=msg): per2.start_time - with pytest.raises(OutOfBoundsDatetime, match="2300-01-01"): + with pytest.raises(OutOfBoundsDatetime, match=msg): per2.end_time def 
test_asfreq_mult(self): diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 3846274dacd75..304033f82c7a2 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -79,7 +79,8 @@ def test_construction(self): with pytest.raises(ValueError, match=msg): Period(ordinal=200701) - with pytest.raises(ValueError, match="Invalid frequency: X"): + msg = "Invalid frequency: X" + with pytest.raises(ValueError, match=msg): Period("2007-1-1", freq="X") def test_construction_bday(self): @@ -235,26 +236,34 @@ def test_period_constructor_offsets(self): assert i1 == expected def test_invalid_arguments(self): - with pytest.raises(ValueError): + msg = "Must supply freq for datetime value" + with pytest.raises(ValueError, match=msg): Period(datetime.now()) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Period(datetime.now().date()) - with pytest.raises(ValueError): + msg = "Value must be Period, string, integer, or datetime" + with pytest.raises(ValueError, match=msg): Period(1.6, freq="D") - with pytest.raises(ValueError): + msg = "Ordinal must be an integer" + with pytest.raises(ValueError, match=msg): Period(ordinal=1.6, freq="D") - with pytest.raises(ValueError): + msg = "Only value or ordinal but not both should be given but not both" + with pytest.raises(ValueError, match=msg): Period(ordinal=2, value=1, freq="D") - with pytest.raises(ValueError): + msg = "If value is None, freq cannot be None" + with pytest.raises(ValueError, match=msg): Period(month=1) - with pytest.raises(ValueError): + msg = "Given date string not likely a datetime" + with pytest.raises(ValueError, match=msg): Period("-2000", "A") - with pytest.raises(DateParseError): + msg = "day is out of range for month" + with pytest.raises(DateParseError, match=msg): Period("0", "A") - with pytest.raises(DateParseError): + msg = "Unknown datetime string format, unable to parse" + with pytest.raises(DateParseError, match=msg): Period("1/1/-2000", "A") def test_constructor_corner(self): @@ -347,10 +356,18 @@ def test_period_from_ordinal(self): assert p == res assert isinstance(res, Period) - def test_period_cons_nat(self): - p = Period("NaT", freq="M") - assert p is NaT + @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) + def test_construct_from_nat_string_and_freq(self, freq): + per = Period("NaT", freq=freq) + assert per is NaT + + per = Period("NaT", freq="2" + freq) + assert per is NaT + + per = Period("NaT", freq="3" + freq) + assert per is NaT + def test_period_cons_nat(self): p = Period("nat", freq="W-SUN") assert p is NaT @@ -930,87 +947,83 @@ def test_get_period_field_array_raises_on_out_of_range(self): libperiod.get_period_field_arr(-1, np.empty(1), 0) -class TestComparisons: - def setup_method(self, method): - self.january1 = Period("2000-01", "M") - self.january2 = Period("2000-01", "M") - self.february = Period("2000-02", "M") - self.march = Period("2000-03", "M") - self.day = Period("2012-01-01", "D") - - def test_equal(self): - assert self.january1 == self.january2 - - def test_equal_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 == self.day - - def test_notEqual(self): - assert self.january1 != 1 - assert self.january1 != self.february +class TestPeriodComparisons: + def test_comparison_same_period_different_object(self): + # Separate Period objects for the same period + left = Period("2000-01", "M") + right = Period("2000-01", "M") - def test_greater(self): - 
assert self.february > self.january1 + assert left == right + assert left >= right + assert left <= right + assert not left < right + assert not left > right - def test_greater_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 > self.day + def test_comparison_same_freq(self): + jan = Period("2000-01", "M") + feb = Period("2000-02", "M") - def test_greater_Raises_Type(self): - with pytest.raises(TypeError): - self.january1 > 1 + assert not jan == feb + assert jan != feb + assert jan < feb + assert jan <= feb + assert not jan > feb + assert not jan >= feb - def test_greaterEqual(self): - assert self.january1 >= self.january2 + def test_comparison_mismatched_freq(self): + jan = Period("2000-01", "M") + day = Period("2012-01-01", "D") - def test_greaterEqual_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 >= self.day - - with pytest.raises(TypeError): - print(self.january1 >= 1) - - def test_smallerEqual(self): - assert self.january1 <= self.january2 - - def test_smallerEqual_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 <= self.day - - def test_smallerEqual_Raises_Type(self): - with pytest.raises(TypeError): - self.january1 <= 1 - - def test_smaller(self): - assert self.january1 < self.february - - def test_smaller_Raises_Value(self): - with pytest.raises(IncompatibleFrequency): - self.january1 < self.day - - def test_smaller_Raises_Type(self): - with pytest.raises(TypeError): - self.january1 < 1 - - def test_sort(self): - periods = [self.march, self.january1, self.february] - correctPeriods = [self.january1, self.february, self.march] + msg = r"Input has different freq=D from Period\(freq=M\)" + with pytest.raises(IncompatibleFrequency, match=msg): + jan == day + with pytest.raises(IncompatibleFrequency, match=msg): + jan != day + with pytest.raises(IncompatibleFrequency, match=msg): + jan < day + with pytest.raises(IncompatibleFrequency, match=msg): + jan <= day + with pytest.raises(IncompatibleFrequency, match=msg): + jan > day + with pytest.raises(IncompatibleFrequency, match=msg): + jan >= day + + def test_comparison_invalid_type(self): + jan = Period("2000-01", "M") + + assert not jan == 1 + assert jan != 1 + + msg = "Cannot compare type Period with type int" + for left, right in [(jan, 1), (1, jan)]: + + with pytest.raises(TypeError, match=msg): + left > right + with pytest.raises(TypeError, match=msg): + left >= right + with pytest.raises(TypeError, match=msg): + left < right + with pytest.raises(TypeError, match=msg): + left <= right + + def test_sort_periods(self): + jan = Period("2000-01", "M") + feb = Period("2000-02", "M") + mar = Period("2000-03", "M") + periods = [mar, jan, feb] + correctPeriods = [jan, feb, mar] assert sorted(periods) == correctPeriods - def test_period_nat_comp(self): - p_nat = Period("NaT", freq="D") + def test_period_cmp_nat(self): p = Period("2011-01-01", freq="D") - nat = Timestamp("NaT") t = Timestamp("2011-01-01") # confirm Period('NaT') work identical with Timestamp('NaT') for left, right in [ - (p_nat, p), - (p, p_nat), - (p_nat, p_nat), - (nat, t), - (t, nat), - (nat, nat), + (NaT, p), + (p, NaT), + (NaT, t), + (t, NaT), ]: assert not left < right assert not left > right @@ -1026,7 +1039,8 @@ def test_sub_delta(self): result = left - right assert result == 4 * right.freq - with pytest.raises(IncompatibleFrequency): + msg = r"Input has different freq=M from Period\(freq=A-DEC\)" + with pytest.raises(IncompatibleFrequency, match=msg): left - 
Period("2007-01", freq="M") def test_add_integer(self): @@ -1043,13 +1057,6 @@ def test_add_sub_nat(self): assert p - NaT is NaT assert NaT - p is NaT - p = Period("NaT", freq="M") - assert p is NaT - assert p + NaT is NaT - assert NaT + p is NaT - assert p - NaT is NaT - assert NaT - p is NaT - def test_add_invalid(self): # GH#4731 per1 = Period(freq="D", year=2008, month=1, day=1) @@ -1075,10 +1082,14 @@ def test_add_timestamp_raises(self, rbox, lbox): # We may get a different message depending on which class raises # the error. - msg = ( - r"cannot add|unsupported operand|" - r"can only operate on a|incompatible type|" - r"ufunc add cannot use operands" + msg = "|".join( + [ + "cannot add", + "unsupported operand", + "can only operate on a", + "incompatible type", + "ufunc add cannot use operands", + ] ) with pytest.raises(TypeError, match=msg): lbox(ts) + rbox(per) @@ -1151,14 +1162,22 @@ def test_add_offset(self): np.timedelta64(365, "D"), timedelta(365), ]: - with pytest.raises(IncompatibleFrequency): + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): p + o if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): o + p else: - with pytest.raises(IncompatibleFrequency): + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + with pytest.raises(IncompatibleFrequency, match=msg): o + p for freq in ["M", "2M", "3M"]: @@ -1178,14 +1197,22 @@ def test_add_offset(self): np.timedelta64(365, "D"), timedelta(365), ]: - with pytest.raises(IncompatibleFrequency): + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): p + o if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): o + p else: - with pytest.raises(IncompatibleFrequency): + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + with pytest.raises(IncompatibleFrequency, match=msg): o + p # freq is Tick @@ -1202,12 +1229,13 @@ def test_add_offset(self): exp = Period("2011-04-03", freq=freq) assert p + np.timedelta64(2, "D") == exp - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): np.timedelta64(2, "D") + p exp = Period("2011-04-02", freq=freq) assert p + np.timedelta64(3600 * 24, "s") == exp - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): np.timedelta64(3600 * 24, "s") + p exp = Period("2011-03-30", freq=freq) @@ -1225,14 +1253,22 @@ def test_add_offset(self): np.timedelta64(4, "h"), timedelta(hours=23), ]: - with pytest.raises(IncompatibleFrequency): + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): p + o if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): o + p else: - with pytest.raises(IncompatibleFrequency): + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + with pytest.raises(IncompatibleFrequency, match=msg): o + p for freq in ["H", "2H", "3H"]: @@ -1246,14 +1282,15 @@ def test_add_offset(self): assert p + offsets.Hour(3) == exp assert offsets.Hour(3) + p == exp + msg = "cannot use 
operands with types" exp = Period("2011-04-01 12:00", freq=freq) assert p + np.timedelta64(3, "h") == exp - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): np.timedelta64(3, "h") + p exp = Period("2011-04-01 10:00", freq=freq) assert p + np.timedelta64(3600, "s") == exp - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): np.timedelta64(3600, "s") + p exp = Period("2011-04-01 11:00", freq=freq) @@ -1271,103 +1308,27 @@ def test_add_offset(self): np.timedelta64(3200, "s"), timedelta(hours=23, minutes=30), ]: - with pytest.raises(IncompatibleFrequency): + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): p + o if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): + msg = "cannot use operands with types" + with pytest.raises(TypeError, match=msg): o + p else: - with pytest.raises(IncompatibleFrequency): + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + with pytest.raises(IncompatibleFrequency, match=msg): o + p - def test_add_offset_nat(self): - # freq is DateOffset - for freq in ["A", "2A", "3A"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [offsets.YearEnd(2)]: - assert p + o is NaT - assert o + p is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - assert p + o is NaT - assert o + p is NaT - - for freq in ["M", "2M", "3M"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - assert p + o is NaT - assert o + p is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - assert p + o is NaT - assert o + p is NaT - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [ - offsets.Day(5), - offsets.Hour(24), - np.timedelta64(2, "D"), - np.timedelta64(3600 * 24, "s"), - timedelta(-2), - timedelta(hours=48), - ]: - assert p + o is NaT - assert o + p is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - assert p + o is NaT - assert o + p is NaT - - for freq in ["H", "2H", "3H"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [ - offsets.Day(2), - offsets.Hour(3), - np.timedelta64(3, "h"), - np.timedelta64(3600, "s"), - timedelta(minutes=120), - timedelta(days=4, minutes=180), - ]: - assert p + o is NaT - assert o + p is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, "s"), - timedelta(hours=23, minutes=30), - ]: - assert p + o is NaT - assert o + p is NaT - def test_sub_offset(self): # freq is DateOffset + msg = "Input has different freq|Input cannot be converted to Period" for freq in ["A", "2A", "3A"]: p = Period("2011", freq=freq) assert p - offsets.YearEnd(2) == Period("2009", freq=freq) @@ -1379,7 +1340,7 @@ def test_sub_offset(self): np.timedelta64(365, "D"), timedelta(365), ]: - with pytest.raises(IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency, match=msg): p - o for freq in ["M", "2M", "3M"]: @@ -1394,7 +1355,7 @@ def test_sub_offset(self): np.timedelta64(365, "D"), timedelta(365), ]: - with pytest.raises(IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency, match=msg): p - o # freq is Tick @@ -1414,7 
+1375,7 @@ def test_sub_offset(self): np.timedelta64(4, "h"), timedelta(hours=23), ]: - with pytest.raises(IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency, match=msg): p - o for freq in ["H", "2H", "3H"]: @@ -1437,95 +1398,13 @@ def test_sub_offset(self): np.timedelta64(3200, "s"), timedelta(hours=23, minutes=30), ]: - with pytest.raises(IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency, match=msg): p - o - def test_sub_offset_nat(self): - # freq is DateOffset - for freq in ["A", "2A", "3A"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [offsets.YearEnd(2)]: - assert p - o is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - assert p - o is NaT - - for freq in ["M", "2M", "3M"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - assert p - o is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - assert p - o is NaT - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [ - offsets.Day(5), - offsets.Hour(24), - np.timedelta64(2, "D"), - np.timedelta64(3600 * 24, "s"), - timedelta(-2), - timedelta(hours=48), - ]: - assert p - o is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - assert p - o is NaT - - for freq in ["H", "2H", "3H"]: - p = Period("NaT", freq=freq) - assert p is NaT - for o in [ - offsets.Day(2), - offsets.Hour(3), - np.timedelta64(3, "h"), - np.timedelta64(3600, "s"), - timedelta(minutes=120), - timedelta(days=4, minutes=180), - ]: - assert p - o is NaT - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, "s"), - timedelta(hours=23, minutes=30), - ]: - assert p - o is NaT - @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) - def test_nat_ops(self, freq): - p = Period("NaT", freq=freq) - assert p is NaT - assert p + 1 is NaT - assert 1 + p is NaT - assert p - 1 is NaT - assert p - Period("2011-01", freq=freq) is NaT - assert Period("2011-01", freq=freq) - p is NaT + def test_period_addsub_nat(self, freq): + assert NaT - Period("2011-01", freq=freq) is NaT + assert Period("2011-01", freq=freq) - NaT is NaT def test_period_ops_offset(self): p = Period("2011-04-01", freq="D") @@ -1547,12 +1426,14 @@ def test_period_ops_offset(self): def test_period_immutable(): # see gh-17116 + msg = "not writable" + per = Period("2014Q1") - with pytest.raises(AttributeError): + with pytest.raises(AttributeError, match=msg): per.ordinal = 14 freq = per.freq - with pytest.raises(AttributeError): + with pytest.raises(AttributeError, match=msg): per.freq = 2 * freq diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index dcb9d66708724..a0e3f8984fbe4 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -21,10 +23,12 @@ def test_repr(): def test_truthiness(): - with pytest.raises(TypeError): + msg = "boolean value of NA is ambiguous" + + with pytest.raises(TypeError, match=msg): bool(NA) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): not NA @@ -143,7 +147,8 @@ def test_logical_and(): assert False & NA is False assert NA & NA is NA - with 
pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): NA & 5 @@ -155,7 +160,8 @@ def test_logical_or(): assert False | NA is NA assert NA | NA is NA - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): NA | 5 @@ -167,7 +173,8 @@ def test_logical_xor(): assert False ^ NA is NA assert NA ^ NA is NA - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): NA ^ 5 @@ -214,7 +221,8 @@ def test_ufunc(): def test_ufunc_raises(): - with pytest.raises(ValueError, match="ufunc method 'at'"): + msg = "ufunc method 'at'" + with pytest.raises(ValueError, match=msg): np.log.at(pd.NA, 0) @@ -267,3 +275,26 @@ def test_integer_hash_collision_set(): assert len(result) == 2 assert NA in result assert hash(NA) in result + + +def test_pickle_roundtrip(): + # https://github.com/pandas-dev/pandas/issues/31847 + result = pickle.loads(pickle.dumps(pd.NA)) + assert result is pd.NA + + +def test_pickle_roundtrip_pandas(): + result = tm.round_trip_pickle(pd.NA) + assert result is pd.NA + + +@pytest.mark.parametrize( + "values, dtype", [([1, 2, pd.NA], "Int64"), (["A", "B", pd.NA], "string")] +) +@pytest.mark.parametrize("as_frame", [True, False]) +def test_pickle_roundtrip_containers(as_frame, values, dtype): + s = pd.Series(pd.array(values, dtype=dtype)) + if as_frame: + s = s.to_frame(name="A") + result = tm.round_trip_pickle(s) + tm.assert_equal(result, s) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index a537f000959e3..0e5414a8b4d2d 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -20,6 +20,7 @@ TimedeltaIndex, Timestamp, isna, + offsets, ) import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray @@ -392,12 +393,14 @@ def test_nat_arithmetic_scalar(op_name, value, val_type): elif val_type == "str": # un-specific check here because the message comes from str # and varies by method - msg = ( - "can only concatenate str|" - "unsupported operand type|" - "can't multiply sequence|" - "Can't convert 'NaTType'|" - "must be str, not NaTType" + msg = "|".join( + [ + "can only concatenate str", + "unsupported operand type", + "can't multiply sequence", + "Can't convert 'NaTType'", + "must be str, not NaTType", + ] ) else: msg = "unsupported operand type" @@ -508,3 +511,38 @@ def test_nat_comparisons(compare_operators_no_eq_ne, other): # GH 26039 assert getattr(NaT, compare_operators_no_eq_ne)(other) is False assert getattr(other, compare_operators_no_eq_ne)(NaT) is False + + +@pytest.mark.parametrize( + "obj", + [ + offsets.YearEnd(2), + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.MonthEnd(2), + offsets.MonthEnd(12), + offsets.Day(2), + offsets.Day(5), + offsets.Hour(24), + offsets.Hour(3), + offsets.Minute(), + np.timedelta64(3, "h"), + np.timedelta64(4, "h"), + np.timedelta64(3200, "s"), + np.timedelta64(3600, "s"), + np.timedelta64(3600 * 24, "s"), + np.timedelta64(2, "D"), + np.timedelta64(365, "D"), + timedelta(-2), + timedelta(365), + timedelta(minutes=120), + timedelta(days=4, minutes=180), + timedelta(hours=23), + timedelta(hours=23, minutes=30), + timedelta(hours=48), + ], +) +def test_nat_addsub_tdlike_scalar(obj): + assert NaT + obj is NaT + assert obj + NaT is NaT + assert NaT - obj is NaT diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py 
index 230a14aeec60a..7baeb8f5673bc 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -88,6 +88,14 @@ def test_td_add_datetimelike_scalar(self, op): result = op(td, NaT) assert result is NaT + def test_td_add_timestamp_overflow(self): + msg = "int too (large|big) to convert" + with pytest.raises(OverflowError, match=msg): + Timestamp("1700-01-01") + Timedelta(13 * 19999, unit="D") + + with pytest.raises(OverflowError, match=msg): + Timestamp("1700-01-01") + timedelta(days=13 * 19999) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): td = Timedelta(10, unit="d") @@ -173,14 +181,15 @@ def test_td_sub_offset(self): def test_td_add_sub_numeric_raises(self): td = Timedelta(10, unit="d") + msg = "unsupported operand type" for other in [2, 2.0, np.int64(2), np.float64(2)]: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): td + other - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): other + td - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): td - other - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): other - td def test_td_rsub_nat(self): @@ -221,7 +230,8 @@ def test_td_rsub_mixed_most_timedeltalike_object_dtype_array(self): # GH#21980 now = Timestamp.now() arr = np.array([now, Timedelta("1D"), np.timedelta64(2, "h")]) - with pytest.raises(TypeError): + msg = r"unsupported operand type\(s\) for \-: 'Timedelta' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): Timedelta("1D") - arr @pytest.mark.parametrize("op", [operator.add, ops.radd]) @@ -315,7 +325,8 @@ class TestTimedeltaMultiplicationDivision: def test_td_mul_nat(self, op, td_nat): # GH#19819 td = Timedelta(10, unit="d") - with pytest.raises(TypeError): + msg = "cannot use operands with types|Cannot multiply Timedelta with NaT" + with pytest.raises(TypeError, match=msg): op(td, td_nat) @pytest.mark.parametrize("nan", [np.nan, np.float64("NaN"), float("nan")]) @@ -342,11 +353,12 @@ def test_td_mul_scalar(self, op): assert op(-1, td).value == -1 * td.value assert op(-1.0, td).value == -1.0 * td.value - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): # timedelta * datetime is gibberish op(td, Timestamp(2016, 1, 2)) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): # invalid multiply with another timedelta op(td, td) @@ -365,6 +377,26 @@ def test_td_div_timedeltalike_scalar(self): assert np.isnan(td / NaT) + def test_td_div_td64_non_nano(self): + + # truediv + td = Timedelta("1 days 2 hours 3 ns") + result = td / np.timedelta64(1, "D") + assert result == td.value / float(86400 * 1e9) + result = td / np.timedelta64(1, "s") + assert result == td.value / float(1e9) + result = td / np.timedelta64(1, "ns") + assert result == td.value + + # floordiv + td = Timedelta("1 days 2 hours 3 ns") + result = td // np.timedelta64(1, "D") + assert result == 1 + result = td // np.timedelta64(1, "s") + assert result == 93600 + result = td // np.timedelta64(1, "ns") + assert result == td.value + def test_td_div_numeric_scalar(self): # GH#19738 td = Timedelta(10, unit="d") @@ -412,6 +444,50 @@ def test_td_rdiv_timedeltalike_scalar(self): assert np.timedelta64(60, "h") / td == 0.25 + def test_td_rdiv_na_scalar(self): + # GH#31869 None gets cast to NaT + td = Timedelta(10, unit="d") + + result = NaT / td + assert np.isnan(result) + + result = None / td + 
assert np.isnan(result) + + result = np.timedelta64("NaT") / td + assert np.isnan(result) + + msg = "cannot use operands with types dtype" + with pytest.raises(TypeError, match=msg): + np.datetime64("NaT") / td + + msg = "Cannot divide float by Timedelta" + with pytest.raises(TypeError, match=msg): + np.nan / td + + def test_td_rdiv_ndarray(self): + td = Timedelta(10, unit="d") + + arr = np.array([td], dtype=object) + result = arr / td + expected = np.array([1], dtype=np.float64) + tm.assert_numpy_array_equal(result, expected) + + arr = np.array([None]) + result = arr / td + expected = np.array([np.nan]) + tm.assert_numpy_array_equal(result, expected) + + arr = np.array([np.nan], dtype=object) + msg = "Cannot divide float by Timedelta" + with pytest.raises(TypeError, match=msg): + arr / td + + arr = np.array([np.nan], dtype=np.float64) + msg = "cannot use operands with types dtype" + with pytest.raises(TypeError, match=msg): + arr / td + # --------------------------------------------------------------- # Timedelta.__floordiv__ @@ -442,7 +518,13 @@ def test_td_floordiv_invalid_scalar(self): # GH#18846 td = Timedelta(hours=3, minutes=4) - with pytest.raises(TypeError): + msg = "|".join( + [ + r"Invalid dtype datetime64\[D\] for __floordiv__", + "'dtype' is an invalid keyword argument for this function", + ] + ) + with pytest.raises(TypeError, match=msg): td // np.datetime64("2016-01-01", dtype="datetime64[us]") def test_td_floordiv_numeric_scalar(self): @@ -513,7 +595,8 @@ def test_td_rfloordiv_invalid_scalar(self): td = Timedelta(hours=3, minutes=3) dt64 = np.datetime64("2016-01-01", "us") - with pytest.raises(TypeError): + msg = r"Invalid dtype datetime64\[us\] for __floordiv__" + with pytest.raises(TypeError, match=msg): td.__rfloordiv__(dt64) def test_td_rfloordiv_numeric_scalar(self): @@ -524,11 +607,12 @@ def test_td_rfloordiv_numeric_scalar(self): assert td.__rfloordiv__(3.5) is NotImplemented assert td.__rfloordiv__(2) is NotImplemented - with pytest.raises(TypeError): + msg = "Invalid dtype" + with pytest.raises(TypeError, match=msg): td.__rfloordiv__(np.float64(2.0)) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): td.__rfloordiv__(np.uint8(9)) - with pytest.raises(TypeError, match="Invalid dtype"): + with pytest.raises(TypeError, match=msg): # deprecated GH#19761, enforced GH#29797 td.__rfloordiv__(np.int32(2.0)) @@ -549,6 +633,14 @@ def test_td_rfloordiv_timedeltalike_array(self): expected = np.array([10, np.nan]) tm.assert_numpy_array_equal(res, expected) + def test_td_rfloordiv_intarray(self): + # deprecated GH#19761, enforced GH#29797 + ints = np.array([1349654400, 1349740800, 1349827200, 1349913600]) * 10 ** 9 + + msg = "Invalid dtype" + with pytest.raises(TypeError, match=msg): + ints // Timedelta(1, unit="s") + def test_td_rfloordiv_numeric_series(self): # GH#18846 td = Timedelta(hours=3, minutes=3) @@ -556,7 +648,8 @@ def test_td_rfloordiv_numeric_series(self): res = td.__rfloordiv__(ser) assert res is NotImplemented - with pytest.raises(TypeError, match="Invalid dtype"): + msg = "Invalid dtype" + with pytest.raises(TypeError, match=msg): # Deprecated GH#19761, enforced GH#29797 # TODO: GH-19761. Change to TypeError. 
ser // td @@ -623,11 +716,11 @@ def test_mod_numeric(self): def test_mod_invalid(self): # GH#19365 td = Timedelta(hours=37) - - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): td % Timestamp("2018-01-22") - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): td % [] def test_rmod_pytimedelta(self): @@ -649,16 +742,18 @@ def test_rmod_invalid(self): # GH#19365 td = Timedelta(minutes=3) - with pytest.raises(TypeError): + msg = "unsupported operand" + with pytest.raises(TypeError, match=msg): Timestamp("2018-01-22") % td - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): 15 % td - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): 16.0 % td - with pytest.raises(TypeError): + msg = "Invalid dtype int" + with pytest.raises(TypeError, match=msg): np.array([22, 24]) % td # ---------------------------------------------------------------- @@ -709,7 +804,8 @@ def test_divmod_invalid(self): # GH#19365 td = Timedelta(days=2, hours=6) - with pytest.raises(TypeError): + msg = r"unsupported operand type\(s\) for //: 'Timedelta' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): divmod(td, Timestamp("2018-01-22")) def test_rdivmod_pytimedelta(self): @@ -728,17 +824,19 @@ def test_rdivmod_offset(self): def test_rdivmod_invalid(self): # GH#19365 td = Timedelta(minutes=3) + msg = "unsupported operand type" - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): divmod(Timestamp("2018-01-22"), td) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): divmod(15, td) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): divmod(16.0, td) - with pytest.raises(TypeError): + msg = "Invalid dtype int" + with pytest.raises(TypeError, match=msg): divmod(np.array([22, 24]), td) # ---------------------------------------------------------------- @@ -754,5 +852,160 @@ def test_rdivmod_invalid(self): ], ) def test_td_op_timedelta_timedeltalike_array(self, op, arr): - with pytest.raises(TypeError): + msg = "unsupported operand type|cannot use operands with types" + with pytest.raises(TypeError, match=msg): op(arr, Timedelta("1D")) + + +class TestTimedeltaComparison: + def test_compare_tick(self, tick_classes): + cls = tick_classes + + off = cls(4) + td = off.delta + assert isinstance(td, Timedelta) + + assert td == off + assert not td != off + assert td <= off + assert td >= off + assert not td < off + assert not td > off + + assert not td == 2 * off + assert td != 2 * off + assert td <= 2 * off + assert td < 2 * off + assert not td >= 2 * off + assert not td > 2 * off + + def test_comparison_object_array(self): + # analogous to GH#15183 + td = Timedelta("2 days") + other = Timedelta("3 hours") + + arr = np.array([other, td], dtype=object) + res = arr == td + expected = np.array([False, True], dtype=bool) + assert (res == expected).all() + + # 2D case + arr = np.array([[other, td], [td, other]], dtype=object) + res = arr != td + expected = np.array([[True, False], [False, True]], dtype=bool) + assert res.shape == expected.shape + assert (res == expected).all() + + def test_compare_timedelta_ndarray(self): + # GH#11835 + periods = [Timedelta("0 days 01:00:00"), Timedelta("0 days 01:00:00")] + arr = np.array(periods) + result = arr[0] > arr + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_td64_ndarray(self): + # GG#33441 + arr = 
np.arange(5).astype("timedelta64[ns]") + td = pd.Timedelta(arr[1]) + + expected = np.array([False, True, False, False, False], dtype=bool) + + result = td == arr + tm.assert_numpy_array_equal(result, expected) + + result = arr == td + tm.assert_numpy_array_equal(result, expected) + + result = td != arr + tm.assert_numpy_array_equal(result, ~expected) + + result = arr != td + tm.assert_numpy_array_equal(result, ~expected) + + @pytest.mark.skip(reason="GH#20829 is reverted until after 0.24.0") + def test_compare_custom_object(self): + """ + Make sure non supported operations on Timedelta returns NonImplemented + and yields to other operand (GH#20829). + """ + + class CustomClass: + def __init__(self, cmp_result=None): + self.cmp_result = cmp_result + + def generic_result(self): + if self.cmp_result is None: + return NotImplemented + else: + return self.cmp_result + + def __eq__(self, other): + return self.generic_result() + + def __gt__(self, other): + return self.generic_result() + + t = Timedelta("1s") + + assert not (t == "string") + assert not (t == 1) + assert not (t == CustomClass()) + assert not (t == CustomClass(cmp_result=False)) + + assert t < CustomClass(cmp_result=True) + assert not (t < CustomClass(cmp_result=False)) + + assert t == CustomClass(cmp_result=True) + + @pytest.mark.parametrize("val", ["string", 1]) + def test_compare_unknown_type(self, val): + # GH#20829 + t = Timedelta("1s") + msg = "not supported between instances of 'Timedelta' and '(int|str)'" + with pytest.raises(TypeError, match=msg): + t >= val + with pytest.raises(TypeError, match=msg): + t > val + with pytest.raises(TypeError, match=msg): + t <= val + with pytest.raises(TypeError, match=msg): + t < val + + +def test_ops_notimplemented(): + class Other: + pass + + other = Other() + + td = Timedelta("1 day") + assert td.__add__(other) is NotImplemented + assert td.__sub__(other) is NotImplemented + assert td.__truediv__(other) is NotImplemented + assert td.__mul__(other) is NotImplemented + assert td.__floordiv__(other) is NotImplemented + + +def test_ops_error_str(): + # GH#13624 + td = Timedelta("1 day") + + for left, right in [(td, "a"), ("a", td)]: + + msg = "|".join( + [ + "unsupported operand type", + r'can only concatenate str \(not "Timedelta"\) to str', + "must be str, not Timedelta", + ] + ) + with pytest.raises(TypeError, match=msg): + left + right + + msg = "not supported between instances of" + with pytest.raises(TypeError, match=msg): + left > right + + assert not left == right + assert left != right diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index d32d1994cac74..c58994d738562 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -51,6 +51,7 @@ def test_construction(): assert Timedelta("1 milli") == timedelta(milliseconds=1) assert Timedelta("1 millisecond") == timedelta(milliseconds=1) assert Timedelta("1 us") == timedelta(microseconds=1) + assert Timedelta("1 µs") == timedelta(microseconds=1) assert Timedelta("1 micros") == timedelta(microseconds=1) assert Timedelta("1 microsecond") == timedelta(microseconds=1) assert Timedelta("1.5 microsecond") == Timedelta("00:00:00.000001500") @@ -79,22 +80,26 @@ def test_construction(): # Currently invalid as it has a - on the hh:mm:dd part # (only allowed on the days) - with pytest.raises(ValueError): + msg = "only leading negative signs are allowed" + with pytest.raises(ValueError, match=msg): Timedelta("-10 
days -1 h 1.5m 1s 3us") # only leading neg signs are allowed - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timedelta("10 days -1 h 1.5m 1s 3us") # no units specified - with pytest.raises(ValueError): + msg = "no units specified" + with pytest.raises(ValueError, match=msg): Timedelta("3.1415") # invalid construction - with pytest.raises(ValueError, match="cannot construct a Timedelta"): + msg = "cannot construct a Timedelta" + with pytest.raises(ValueError, match=msg): Timedelta() - with pytest.raises(ValueError, match="unit abbreviation w/o a number"): + msg = "unit abbreviation w/o a number" + with pytest.raises(ValueError, match=msg): Timedelta("foo") msg = ( @@ -121,7 +126,8 @@ def test_construction(): assert result == expected assert to_timedelta(offsets.Hour(2)) == Timedelta("0 days, 02:00:00") - with pytest.raises(ValueError): + msg = "unit abbreviation w/o a number" + with pytest.raises(ValueError, match=msg): Timedelta("foo bar") @@ -170,23 +176,24 @@ def test_td_from_repr_roundtrip(val): td = Timedelta(val) assert Timedelta(td.value) == td - # str does not normally display nanos - if not td.nanoseconds: - assert Timedelta(str(td)) == td + assert Timedelta(str(td)) == td assert Timedelta(td._repr_base(format="all")) == td + assert Timedelta(td._repr_base()) == td def test_overflow_on_construction(): + msg = "int too (large|big) to convert" + # GH#3374 value = Timedelta("1day").value * 20169940 - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): Timedelta(value) # xref GH#17637 - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): Timedelta(7 * 19999, unit="D") - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): Timedelta(timedelta(days=13 * 19999)) @@ -272,7 +279,8 @@ def test_td_constructor_on_nanoseconds(constructed_td, conversion): def test_td_constructor_value_error(): - with pytest.raises(TypeError): + msg = "Invalid type . Must be int or float." 
+ with pytest.raises(TypeError, match=msg): Timedelta(nanoseconds="abc") diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 9cdbeb6ab4845..38e77321418d1 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -4,56 +4,14 @@ import numpy as np import pytest -from pandas._libs.tslibs import NaT, Timestamp, iNaT +from pandas._libs.tslibs import NaT, iNaT import pandas as pd -from pandas import Series, Timedelta, TimedeltaIndex, timedelta_range, to_timedelta +from pandas import Timedelta, TimedeltaIndex, offsets, to_timedelta import pandas._testing as tm -class TestTimedeltaArithmetic: - def test_arithmetic_overflow(self): - with pytest.raises(OverflowError): - Timestamp("1700-01-01") + Timedelta(13 * 19999, unit="D") - - with pytest.raises(OverflowError): - Timestamp("1700-01-01") + timedelta(days=13 * 19999) - - def test_array_timedelta_floordiv(self): - # deprecated GH#19761, enforced GH#29797 - ints = pd.date_range("2012-10-08", periods=4, freq="D").view("i8") - - with pytest.raises(TypeError, match="Invalid dtype"): - ints // Timedelta(1, unit="s") - - def test_ops_error_str(self): - # GH 13624 - td = Timedelta("1 day") - - for left, right in [(td, "a"), ("a", td)]: - - with pytest.raises(TypeError): - left + right - - with pytest.raises(TypeError): - left > right - - assert not left == right - assert left != right - - def test_ops_notimplemented(self): - class Other: - pass - - other = Other() - - td = Timedelta("1 day") - assert td.__add__(other) is NotImplemented - assert td.__sub__(other) is NotImplemented - assert td.__truediv__(other) is NotImplemented - assert td.__mul__(other) is NotImplemented - assert td.__floordiv__(other) is NotImplemented - +class TestTimedeltaUnaryOps: def test_unary_ops(self): td = Timedelta(10, unit="d") @@ -68,102 +26,6 @@ def test_unary_ops(self): assert abs(-td) == Timedelta("10d") -class TestTimedeltaComparison: - def test_compare_tick(self, tick_classes): - cls = tick_classes - - off = cls(4) - td = off.delta - assert isinstance(td, Timedelta) - - assert td == off - assert not td != off - assert td <= off - assert td >= off - assert not td < off - assert not td > off - - assert not td == 2 * off - assert td != 2 * off - assert td <= 2 * off - assert td < 2 * off - assert not td >= 2 * off - assert not td > 2 * off - - def test_comparison_object_array(self): - # analogous to GH#15183 - td = Timedelta("2 days") - other = Timedelta("3 hours") - - arr = np.array([other, td], dtype=object) - res = arr == td - expected = np.array([False, True], dtype=bool) - assert (res == expected).all() - - # 2D case - arr = np.array([[other, td], [td, other]], dtype=object) - res = arr != td - expected = np.array([[True, False], [False, True]], dtype=bool) - assert res.shape == expected.shape - assert (res == expected).all() - - def test_compare_timedelta_ndarray(self): - # GH11835 - periods = [Timedelta("0 days 01:00:00"), Timedelta("0 days 01:00:00")] - arr = np.array(periods) - result = arr[0] > arr - expected = np.array([False, False]) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.skip(reason="GH#20829 is reverted until after 0.24.0") - def test_compare_custom_object(self): - """ - Make sure non supported operations on Timedelta returns NonImplemented - and yields to other operand (GH#20829). 
- """ - - class CustomClass: - def __init__(self, cmp_result=None): - self.cmp_result = cmp_result - - def generic_result(self): - if self.cmp_result is None: - return NotImplemented - else: - return self.cmp_result - - def __eq__(self, other): - return self.generic_result() - - def __gt__(self, other): - return self.generic_result() - - t = Timedelta("1s") - - assert not (t == "string") - assert not (t == 1) - assert not (t == CustomClass()) - assert not (t == CustomClass(cmp_result=False)) - - assert t < CustomClass(cmp_result=True) - assert not (t < CustomClass(cmp_result=False)) - - assert t == CustomClass(cmp_result=True) - - @pytest.mark.parametrize("val", ["string", 1]) - def test_compare_unknown_type(self, val): - # GH20829 - t = Timedelta("1s") - with pytest.raises(TypeError): - t >= val - with pytest.raises(TypeError): - t > val - with pytest.raises(TypeError): - t <= val - with pytest.raises(TypeError): - t < val - - class TestTimedeltas: @pytest.mark.parametrize( "unit, value, expected", @@ -209,26 +71,6 @@ def test_conversion(self): td = Timedelta("1 days, 10:11:12.012345678") assert td != td.to_pytimedelta() - def test_freq_conversion(self): - - # truediv - td = Timedelta("1 days 2 hours 3 ns") - result = td / np.timedelta64(1, "D") - assert result == td.value / float(86400 * 1e9) - result = td / np.timedelta64(1, "s") - assert result == td.value / float(1e9) - result = td / np.timedelta64(1, "ns") - assert result == td.value - - # floordiv - td = Timedelta("1 days 2 hours 3 ns") - result = td // np.timedelta64(1, "D") - assert result == 1 - result = td // np.timedelta64(1, "s") - assert result == 93600 - result = td // np.timedelta64(1, "ns") - assert result == td.value - def test_fields(self): def check(value): # that we are int @@ -457,13 +299,15 @@ def test_to_numpy_alias(self): td = Timedelta("10m7s") assert td.to_timedelta64() == td.to_numpy() - def test_round(self): - - t1 = Timedelta("1 days 02:34:56.789123456") - t2 = Timedelta("-1 days 02:34:56.789123456") - - for (freq, s1, s2) in [ - ("N", t1, t2), + @pytest.mark.parametrize( + "freq,s1,s2", + [ + # This first case has s1, s2 being the same as t1,t2 below + ( + "N", + Timedelta("1 days 02:34:56.789123456"), + Timedelta("-1 days 02:34:56.789123456"), + ), ( "U", Timedelta("1 days 02:34:56.789123000"), @@ -481,75 +325,21 @@ def test_round(self): ("12T", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), ("H", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), ("d", Timedelta("1 days"), Timedelta("-1 days")), - ]: - r1 = t1.round(freq) - assert r1 == s1 - r2 = t2.round(freq) - assert r2 == s2 - - # invalid - for freq, msg in [ - ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), - ("foobar", "Invalid frequency: foobar"), - ]: - with pytest.raises(ValueError, match=msg): - t1.round(freq) + ], + ) + def test_round(self, freq, s1, s2): - t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") - t2 = -1 * t1 - t1a = timedelta_range("1 days", periods=3, freq="1 min 2 s") - t1c = TimedeltaIndex([1, 1, 1], unit="D") + t1 = Timedelta("1 days 02:34:56.789123456") + t2 = Timedelta("-1 days 02:34:56.789123456") - # note that negative times round DOWN! 
so don't give whole numbers - for (freq, s1, s2) in [ - ("N", t1, t2), - ("U", t1, t2), - ( - "L", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], - dtype="timedelta64[ns]", - freq=None, - ), - ), - ( - "S", - t1a, - TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], - dtype="timedelta64[ns]", - freq=None, - ), - ), - ( - "12T", - t1c, - TimedeltaIndex( - ["-1 days", "-1 days", "-1 days"], - dtype="timedelta64[ns]", - freq=None, - ), - ), - ( - "H", - t1c, - TimedeltaIndex( - ["-1 days", "-1 days", "-1 days"], - dtype="timedelta64[ns]", - freq=None, - ), - ), - ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), - ]: + r1 = t1.round(freq) + assert r1 == s1 + r2 = t2.round(freq) + assert r2 == s2 - r1 = t1.round(freq) - tm.assert_index_equal(r1, s1) - r2 = t2.round(freq) - tm.assert_index_equal(r2, s2) + def test_round_invalid(self): + t1 = Timedelta("1 days 02:34:56.789123456") - # invalid for freq, msg in [ ("Y", " is a non-fixed frequency"), ("M", " is a non-fixed frequency"), @@ -561,7 +351,7 @@ def test_round(self): def test_contains(self): # Checking for any NaT-like objects # GH 13603 - td = to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + td = to_timedelta(range(5), unit="d") + offsets.Hour(1) for v in [NaT, None, float("nan"), np.nan]: assert not (v in td) @@ -618,9 +408,11 @@ def conv(v): assert Timedelta(" - 10000D ") == -conv(np.timedelta64(10000, "D")) # invalid - with pytest.raises(ValueError): + msg = "invalid unit abbreviation" + with pytest.raises(ValueError, match=msg): Timedelta("1foo") - with pytest.raises(ValueError): + msg = "unit abbreviation w/o a number" + with pytest.raises(ValueError, match=msg): Timedelta("foo") def test_full_format_converters(self): @@ -649,31 +441,9 @@ def conv(v): ) # invalid - with pytest.raises(ValueError): - Timedelta("- 1days, 00") - - def test_overflow(self): - # GH 9442 - s = Series(pd.date_range("20130101", periods=100000, freq="H")) - s[0] += Timedelta("1s 1ms") - - # mean - result = (s - s.min()).mean() - expected = Timedelta((TimedeltaIndex((s - s.min())).asi8 / len(s)).sum()) - - # the computation is converted to float so - # might be some loss of precision - assert np.allclose(result.value / 1000, expected.value / 1000) - - # sum - msg = "overflow in timedelta operation" + msg = "have leftover units" with pytest.raises(ValueError, match=msg): - (s - s.min()).sum() - s1 = s[0:10000] - with pytest.raises(ValueError, match=msg): - (s1 - s1.min()).sum() - s2 = s[0:1000] - result = (s2 - s2.min()).sum() + Timedelta("- 1days, 00") def test_pickle(self): @@ -690,7 +460,7 @@ def test_timedelta_hash_equality(self): d = {td: 2} assert d[v] == 2 - tds = timedelta_range("1 second", periods=20) + tds = [Timedelta(seconds=1) + Timedelta(days=n) for n in range(20)] assert all(hash(td) == hash(td.to_pytimedelta()) for td in tds) # python timedeltas drop ns resolution @@ -709,20 +479,21 @@ def test_implementation_limits(self): # Beyond lower limit, a NAT before the Overflow assert (min_td - Timedelta(1, "ns")) is NaT - with pytest.raises(OverflowError): + msg = "int too (large|big) to convert" + with pytest.raises(OverflowError, match=msg): min_td - Timedelta(2, "ns") - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): max_td + Timedelta(1, "ns") # Same tests using the internal nanosecond values td = Timedelta(min_td.value - 1, "ns") assert td is NaT - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): 
Timedelta(min_td.value - 2, "ns") - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): Timedelta(max_td.value + 1, "ns") def test_total_seconds_precision(self): @@ -734,57 +505,6 @@ def test_total_seconds_precision(self): assert (Timedelta("30S").total_seconds() - 30.0) < 1e-20 assert (30.0 - Timedelta("30S").total_seconds()) < 1e-20 - def test_timedelta_arithmetic(self): - data = Series(["nat", "32 days"], dtype="timedelta64[ns]") - deltas = [timedelta(days=1), Timedelta(1, unit="D")] - for delta in deltas: - result_method = data.add(delta) - result_operator = data + delta - expected = Series(["nat", "33 days"], dtype="timedelta64[ns]") - tm.assert_series_equal(result_operator, expected) - tm.assert_series_equal(result_method, expected) - - result_method = data.sub(delta) - result_operator = data - delta - expected = Series(["nat", "31 days"], dtype="timedelta64[ns]") - tm.assert_series_equal(result_operator, expected) - tm.assert_series_equal(result_method, expected) - # GH 9396 - result_method = data.div(delta) - result_operator = data / delta - expected = Series([np.nan, 32.0], dtype="float64") - tm.assert_series_equal(result_operator, expected) - tm.assert_series_equal(result_method, expected) - - def test_apply_to_timedelta(self): - timedelta_NaT = to_timedelta("NaT") - - list_of_valid_strings = ["00:00:01", "00:00:02"] - a = to_timedelta(list_of_valid_strings) - b = Series(list_of_valid_strings).apply(to_timedelta) - # Can't compare until apply on a Series gives the correct dtype - # assert_series_equal(a, b) - - list_of_strings = ["00:00:01", np.nan, NaT, timedelta_NaT] - - # TODO: unused? - a = to_timedelta(list_of_strings) # noqa - b = Series(list_of_strings).apply(to_timedelta) # noqa - # Can't compare until apply on a Series gives the correct dtype - # assert_series_equal(a, b) - - def test_components(self): - rng = timedelta_range("1 days, 10:11:12", periods=2, freq="s") - rng.components - - # with nat - s = Series(rng) - s[1] = np.nan - - result = s.dt.components - assert not result.iloc[0].isna().all() - assert result.iloc[1].isna().all() - def test_resolution_string(self): assert Timedelta(days=1).resolution_string == "D" assert Timedelta(days=1, hours=6).resolution_string == "H" diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 1cab007c20a0e..b038ee1aee106 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -3,7 +3,10 @@ import numpy as np import pytest +from pandas.errors import OutOfBoundsDatetime + from pandas import Timedelta, Timestamp +import pandas._testing as tm from pandas.tseries import offsets from pandas.tseries.frequencies import to_offset @@ -60,6 +63,18 @@ def test_overflow_offset_raises(self): with pytest.raises(OverflowError, match=msg): stamp - offset_overflow + def test_overflow_timestamp_raises(self): + # https://github.com/pandas-dev/pandas/issues/31774 + msg = "Result is too large" + a = Timestamp("2101-01-01 00:00:00") + b = Timestamp("1688-01-01 00:00:00") + + with pytest.raises(OutOfBoundsDatetime, match=msg): + a - b + + # but we're OK for timestamp and datetime.datetime + assert (a - b.to_pydatetime()) == (a.to_pydatetime() - b) + def test_delta_preserve_nanos(self): val = Timestamp(1337299200000000123) result = val + timedelta(1) @@ -76,7 +91,8 @@ def test_rsub_dtscalars(self, tz_naive_fixture): if tz_naive_fixture is None: assert other.to_datetime64() - ts == 
td else: - with pytest.raises(TypeError, match="subtraction must have"): + msg = "subtraction must have" + with pytest.raises(TypeError, match=msg): other.to_datetime64() - ts def test_timestamp_sub_datetime(self): @@ -162,28 +178,6 @@ def test_timestamp_add_timedelta64_unit(self, other, expected_difference): valdiff = result.value - ts.value assert valdiff == expected_difference - @pytest.mark.parametrize("ts", [Timestamp.now(), Timestamp.now("utc")]) - @pytest.mark.parametrize( - "other", - [ - 1, - np.int64(1), - np.array([1, 2], dtype=np.int32), - np.array([3, 4], dtype=np.uint64), - ], - ) - def test_add_int_no_freq_raises(self, ts, other): - msg = "Addition/subtraction of integers and integer-arrays" - with pytest.raises(TypeError, match=msg): - ts + other - with pytest.raises(TypeError, match=msg): - other + ts - - with pytest.raises(TypeError, match=msg): - ts - other - with pytest.raises(TypeError): - other - ts - @pytest.mark.parametrize( "ts", [ @@ -201,14 +195,64 @@ def test_add_int_no_freq_raises(self, ts, other): ], ) def test_add_int_with_freq(self, ts, other): - - with pytest.raises(TypeError): + msg = "Addition/subtraction of integers and integer-arrays" + with pytest.raises(TypeError, match=msg): ts + other - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): other + ts - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): ts - other - with pytest.raises(TypeError): + msg = "unsupported operand type" + with pytest.raises(TypeError, match=msg): + other - ts + + @pytest.mark.parametrize("shape", [(6,), (2, 3,)]) + def test_addsub_m8ndarray(self, shape): + # GH#33296 + ts = Timestamp("2020-04-04 15:45") + other = np.arange(6).astype("m8[h]").reshape(shape) + + result = ts + other + + ex_stamps = [ts + Timedelta(hours=n) for n in range(6)] + expected = np.array([x.asm8 for x in ex_stamps], dtype="M8[ns]").reshape(shape) + tm.assert_numpy_array_equal(result, expected) + + result = other + ts + tm.assert_numpy_array_equal(result, expected) + + result = ts - other + ex_stamps = [ts - Timedelta(hours=n) for n in range(6)] + expected = np.array([x.asm8 for x in ex_stamps], dtype="M8[ns]").reshape(shape) + tm.assert_numpy_array_equal(result, expected) + + msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): + other - ts + + @pytest.mark.parametrize("shape", [(6,), (2, 3,)]) + def test_addsub_m8ndarray_tzaware(self, shape): + # GH#33296 + ts = Timestamp("2020-04-04 15:45", tz="US/Pacific") + + other = np.arange(6).astype("m8[h]").reshape(shape) + + result = ts + other + + ex_stamps = [ts + Timedelta(hours=n) for n in range(6)] + expected = np.array(ex_stamps).reshape(shape) + tm.assert_numpy_array_equal(result, expected) + + result = other + ts + tm.assert_numpy_array_equal(result, expected) + + result = ts - other + ex_stamps = [ts - Timedelta(hours=n) for n in range(6)] + expected = np.array(ex_stamps).reshape(shape) + tm.assert_numpy_array_equal(result, expected) + + msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): other - ts diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index fce4fa6eb1eaa..27aef8c4a9eb7 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -5,9 +5,61 @@ import pytest from pandas import Timestamp +import pandas._testing as tm class 
TestTimestampComparison: + def test_comparison_dt64_ndarray(self): + ts = Timestamp.now() + ts2 = Timestamp("2019-04-05") + arr = np.array([[ts.asm8, ts2.asm8]], dtype="M8[ns]") + + result = ts == arr + expected = np.array([[True, False]], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + result = arr == ts + tm.assert_numpy_array_equal(result, expected) + + result = ts != arr + tm.assert_numpy_array_equal(result, ~expected) + + result = arr != ts + tm.assert_numpy_array_equal(result, ~expected) + + result = ts2 < arr + tm.assert_numpy_array_equal(result, expected) + + result = arr < ts2 + tm.assert_numpy_array_equal(result, np.array([[False, False]], dtype=bool)) + + result = ts2 <= arr + tm.assert_numpy_array_equal(result, np.array([[True, True]], dtype=bool)) + + result = arr <= ts2 + tm.assert_numpy_array_equal(result, ~expected) + + result = ts >= arr + tm.assert_numpy_array_equal(result, np.array([[True, True]], dtype=bool)) + + result = arr >= ts + tm.assert_numpy_array_equal(result, np.array([[True, False]], dtype=bool)) + + @pytest.mark.parametrize("reverse", [True, False]) + def test_comparison_dt64_ndarray_tzaware(self, reverse, all_compare_operators): + op = getattr(operator, all_compare_operators.strip("__")) + + ts = Timestamp.now("UTC") + arr = np.array([ts.asm8, ts.asm8], dtype="M8[ns]") + + left, right = ts, arr + if reverse: + left, right = arr, ts + + msg = "Cannot compare tz-naive and tz-aware timestamps" + with pytest.raises(TypeError, match=msg): + op(left, right) + def test_comparison_object_array(self): # GH#15183 ts = Timestamp("2011-01-03 00:00:00-0500", tz="US/Eastern") @@ -28,7 +80,8 @@ def test_comparison_object_array(self): # tzaware mismatch arr = np.array([naive], dtype=object) - with pytest.raises(TypeError): + msg = "Cannot compare tz-naive and tz-aware timestamps" + with pytest.raises(TypeError, match=msg): arr < ts def test_comparison(self): @@ -85,30 +138,31 @@ def test_cant_compare_tz_naive_w_aware(self, utc_fixture): a = Timestamp("3/12/2012") b = Timestamp("3/12/2012", tz=utc_fixture) - with pytest.raises(TypeError): + msg = "Cannot compare tz-naive and tz-aware timestamps" + with pytest.raises(TypeError, match=msg): a == b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a != b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a < b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a <= b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a > b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): a >= b - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b == a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b != a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b < a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b <= a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b > a - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): b >= a assert not a == b.to_pydatetime() diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 4c75d1ebcd377..770753f42a4c8 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -165,20 +165,25 @@ def test_constructor_with_stringoffset(self): assert result == eval(repr(result)) def 
test_constructor_invalid(self): - with pytest.raises(TypeError, match="Cannot convert input"): + msg = "Cannot convert input" + with pytest.raises(TypeError, match=msg): Timestamp(slice(2)) - with pytest.raises(ValueError, match="Cannot convert Period"): + msg = "Cannot convert Period" + with pytest.raises(ValueError, match=msg): Timestamp(Period("1000-01-01")) def test_constructor_invalid_tz(self): # GH#17690 - with pytest.raises(TypeError, match="must be a datetime.tzinfo"): + msg = "must be a datetime.tzinfo" + with pytest.raises(TypeError, match=msg): Timestamp("2017-10-22", tzinfo="US/Eastern") - with pytest.raises(ValueError, match="at most one of"): + msg = "at most one of" + with pytest.raises(ValueError, match=msg): Timestamp("2017-10-22", tzinfo=pytz.utc, tz="UTC") - with pytest.raises(ValueError, match="Invalid frequency:"): + msg = "Invalid frequency:" + with pytest.raises(ValueError, match=msg): # GH#5168 # case where user tries to pass tz as an arg, not kwarg, gets # interpreted as a `freq` @@ -189,7 +194,8 @@ def test_constructor_strptime(self): # Test support for Timestamp.strptime fmt = "%Y%m%d-%H%M%S-%f%z" ts = "20190129-235348-000001+0000" - with pytest.raises(NotImplementedError): + msg = r"Timestamp.strptime\(\) is not implemented" + with pytest.raises(NotImplementedError, match=msg): Timestamp.strptime(ts, fmt) def test_constructor_tz_or_tzinfo(self): @@ -206,15 +212,20 @@ def test_constructor_tz_or_tzinfo(self): def test_constructor_positional(self): # see gh-10758 - with pytest.raises(TypeError): + msg = "an integer is required" + with pytest.raises(TypeError, match=msg): Timestamp(2000, 1) - with pytest.raises(ValueError): + + msg = "month must be in 1..12" + with pytest.raises(ValueError, match=msg): Timestamp(2000, 0, 1) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(2000, 13, 1) - with pytest.raises(ValueError): + + msg = "day is out of range for month" + with pytest.raises(ValueError, match=msg): Timestamp(2000, 1, 0) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(2000, 1, 32) # see gh-11630 @@ -225,15 +236,20 @@ def test_constructor_positional(self): def test_constructor_keyword(self): # GH 10758 - with pytest.raises(TypeError): + msg = "function missing required argument 'day'|Required argument 'day'" + with pytest.raises(TypeError, match=msg): Timestamp(year=2000, month=1) - with pytest.raises(ValueError): + + msg = "month must be in 1..12" + with pytest.raises(ValueError, match=msg): Timestamp(year=2000, month=0, day=1) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(year=2000, month=13, day=1) - with pytest.raises(ValueError): + + msg = "day is out of range for month" + with pytest.raises(ValueError, match=msg): Timestamp(year=2000, month=1, day=0) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(year=2000, month=1, day=32) assert repr(Timestamp(year=2015, month=11, day=12)) == repr( @@ -313,7 +329,8 @@ def test_constructor_nanosecond(self, result): @pytest.mark.parametrize("z", ["Z0", "Z00"]) def test_constructor_invalid_Z0_isostring(self, z): # GH 8910 - with pytest.raises(ValueError): + msg = "could not convert string to Timestamp" + with pytest.raises(ValueError, match=msg): Timestamp(f"2014-11-02 01:00{z}") @pytest.mark.parametrize( @@ -331,14 +348,17 @@ def test_constructor_invalid_Z0_isostring(self, z): ) def test_invalid_date_kwarg_with_string_input(self, arg): kwarg = {arg: 1} - with 
pytest.raises(ValueError): + msg = "Cannot pass a date attribute keyword argument" + with pytest.raises(ValueError, match=msg): Timestamp("2010-10-10 12:59:59.999999999", **kwarg) def test_out_of_bounds_integer_value(self): # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError - with pytest.raises(OutOfBoundsDatetime): + msg = str(Timestamp.max.value * 2) + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(Timestamp.max.value * 2) - with pytest.raises(OutOfBoundsDatetime): + msg = str(Timestamp.min.value * 2) + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(Timestamp.min.value * 2) def test_out_of_bounds_value(self): @@ -353,25 +373,28 @@ def test_out_of_bounds_value(self): Timestamp(min_ts_us) Timestamp(max_ts_us) + msg = "Out of bounds" # One us less than the minimum is an error - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(min_ts_us - one_us) # One us more than the maximum is an error - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp(max_ts_us + one_us) def test_out_of_bounds_string(self): - with pytest.raises(ValueError): + msg = "Out of bounds" + with pytest.raises(ValueError, match=msg): Timestamp("1676-01-01") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Timestamp("2263-01-01") def test_barely_out_of_bounds(self): # GH#19529 # GH#19382 close enough to bounds that dropping nanos would result # in an in-bounds datetime - with pytest.raises(OutOfBoundsDatetime): + msg = "Out of bounds nanosecond timestamp: 2262-04-11 23:47:16" + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp("2262-04-11 23:47:16.854775808") def test_bounds_with_different_units(self): @@ -382,7 +405,8 @@ def test_bounds_with_different_units(self): for date_string in out_of_bounds_dates: for unit in time_units: dt64 = np.datetime64(date_string, unit) - with pytest.raises(ValueError): + msg = "Out of bounds" + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(dt64) in_bounds_dates = ("1677-09-23", "2262-04-11") @@ -449,7 +473,8 @@ def test_today(self): def test_disallow_setting_tz(self, tz): # GH 3746 ts = Timestamp("2010") - with pytest.raises(AttributeError): + msg = "Cannot directly set timezone" + with pytest.raises(AttributeError, match=msg): ts.tz = tz @pytest.mark.parametrize("offset", ["+0300", "+0200"]) @@ -476,16 +501,19 @@ def test_construct_timestamp_preserve_original_frequency(self): def test_constructor_invalid_frequency(self): # GH 22311 - with pytest.raises(ValueError, match="Invalid frequency:"): + msg = "Invalid frequency:" + with pytest.raises(ValueError, match=msg): Timestamp("2012-01-01", freq=[]) @pytest.mark.parametrize("box", [datetime, Timestamp]) def test_raise_tz_and_tzinfo_in_datetime_input(self, box): # GH 23579 kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": pytz.utc} - with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): + msg = "Cannot pass a datetime or Timestamp" + with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tz="US/Pacific") - with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): + msg = "Cannot pass a datetime or Timestamp" + with pytest.raises(ValueError, match=msg): Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) def test_dont_convert_dateutil_utc_to_pytz_utc(self): diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index cfa7da810ada1..9611c827be6fe 
100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -21,19 +21,20 @@ class TestTimestampTZOperations: # Timestamp.tz_localize def test_tz_localize_pushes_out_of_bounds(self): + msg = "^$" # GH#12677 # tz_localize that pushes away from the boundary is OK pac = Timestamp.min.tz_localize("US/Pacific") assert pac.value > Timestamp.min.value pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value - with pytest.raises(OutOfBoundsDatetime): + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp.min.tz_localize("Asia/Tokyo") # tz_localize that pushes away from the boundary is OK tokyo = Timestamp.max.tz_localize("Asia/Tokyo") assert tokyo.value < Timestamp.max.value tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value - with pytest.raises(OutOfBoundsDatetime): + with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp.max.tz_localize("US/Pacific") def test_tz_localize_ambiguous_bool(self): @@ -43,7 +44,8 @@ def test_tz_localize_ambiguous_bool(self): expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") - with pytest.raises(pytz.AmbiguousTimeError): + msg = "Cannot infer dst time from 2015-11-01 01:00:03" + with pytest.raises(pytz.AmbiguousTimeError, match=msg): ts.tz_localize("US/Central") result = ts.tz_localize("US/Central", ambiguous=True) @@ -58,7 +60,8 @@ def test_tz_localize_ambiguous(self): ts_no_dst = ts.tz_localize("US/Eastern", ambiguous=False) assert (ts_no_dst.value - ts_dst.value) / 1e9 == 3600 - with pytest.raises(ValueError): + msg = "Cannot infer offset with only one time" + with pytest.raises(ValueError, match=msg): ts.tz_localize("US/Eastern", ambiguous="infer") # GH#8025 @@ -82,24 +85,29 @@ def test_tz_localize_ambiguous(self): def test_tz_localize_nonexistent(self, stamp, tz): # GH#13057 ts = Timestamp(stamp) - with pytest.raises(NonExistentTimeError): + with pytest.raises(NonExistentTimeError, match=stamp): ts.tz_localize(tz) # GH 22644 - with pytest.raises(NonExistentTimeError): + with pytest.raises(NonExistentTimeError, match=stamp): ts.tz_localize(tz, nonexistent="raise") assert ts.tz_localize(tz, nonexistent="NaT") is NaT def test_tz_localize_ambiguous_raise(self): # GH#13057 ts = Timestamp("2015-11-1 01:00") - with pytest.raises(AmbiguousTimeError): + msg = "Cannot infer dst time from 2015-11-01 01:00:00," + with pytest.raises(AmbiguousTimeError, match=msg): ts.tz_localize("US/Pacific", ambiguous="raise") def test_tz_localize_nonexistent_invalid_arg(self): # GH 22644 tz = "Europe/Warsaw" ts = Timestamp("2015-03-29 02:00:00") - with pytest.raises(ValueError): + msg = ( + "The nonexistent argument must be one of 'raise', 'NaT', " + "'shift_forward', 'shift_backward' or a timedelta object" + ) + with pytest.raises(ValueError, match=msg): ts.tz_localize(tz, nonexistent="foo") @pytest.mark.parametrize( @@ -117,7 +125,8 @@ def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): localized = ts.tz_localize(tz) assert localized == Timestamp(stamp, tz=tz) - with pytest.raises(TypeError): + msg = "Cannot localize tz-aware Timestamp" + with pytest.raises(TypeError, match=msg): localized.tz_localize(tz) reset = localized.tz_localize(None) @@ -249,9 +258,14 @@ def test_timestamp_tz_localize_nonexistent_NaT(self, tz): def test_timestamp_tz_localize_nonexistent_raise(self, tz): # GH 8917 ts = Timestamp("2015-03-29 02:20:00") - with pytest.raises(pytz.NonExistentTimeError): + msg = "2015-03-29 02:20:00" + 
with pytest.raises(pytz.NonExistentTimeError, match=msg): ts.tz_localize(tz, nonexistent="raise") - with pytest.raises(ValueError): + msg = ( + "The nonexistent argument must be one of 'raise', 'NaT', " + "'shift_forward', 'shift_backward' or a timedelta object" + ) + with pytest.raises(ValueError, match=msg): ts.tz_localize(tz, nonexistent="foo") # ------------------------------------------------------------------ @@ -327,14 +341,16 @@ def test_timestamp_constructor_near_dst_boundary(self): expected = Timestamp("2015-10-25 01:00").tz_localize(tz) assert result == expected - with pytest.raises(pytz.AmbiguousTimeError): + msg = "Cannot infer dst time from 2015-10-25 02:00:00" + with pytest.raises(pytz.AmbiguousTimeError, match=msg): Timestamp("2015-10-25 02:00", tz=tz) result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") expected = Timestamp("2017-03-26 01:00").tz_localize("Europe/Paris") assert result == expected - with pytest.raises(pytz.NonExistentTimeError): + msg = "2017-03-26 02:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): Timestamp("2017-03-26 02:00", tz="Europe/Paris") # GH#11708 @@ -352,7 +368,8 @@ def test_timestamp_constructor_near_dst_boundary(self): expected = Timestamp("2017-03-26 01:00:00+0100", tz="Europe/Paris") assert result == expected - with pytest.raises(pytz.NonExistentTimeError): + msg = "2017-03-26 02:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): Timestamp("2017-03-26 02:00", tz="Europe/Paris") result = Timestamp("2017-03-26 02:00:00+0100", tz="Europe/Paris") diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 78e795e71cd07..e657559b55d5a 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -166,7 +166,8 @@ def test_round_dst_border_ambiguous(self, method): result = getattr(ts, method)("H", ambiguous="NaT") assert result is NaT - with pytest.raises(pytz.AmbiguousTimeError): + msg = "Cannot infer dst time" + with pytest.raises(pytz.AmbiguousTimeError, match=msg): getattr(ts, method)("H", ambiguous="raise") @pytest.mark.parametrize( @@ -187,7 +188,8 @@ def test_round_dst_border_nonexistent(self, method, ts_str, freq): result = getattr(ts, method)(freq, nonexistent="NaT") assert result is NaT - with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): + msg = "2018-03-11 02:00:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): getattr(ts, method)(freq, nonexistent="raise") @pytest.mark.parametrize( @@ -298,14 +300,16 @@ def test_replace_invalid_kwarg(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) - with pytest.raises(TypeError): + msg = r"replace\(\) got an unexpected keyword argument" + with pytest.raises(TypeError, match=msg): ts.replace(foo=5) def test_replace_integer_args(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) - with pytest.raises(ValueError): + msg = "value must be an integer, received for hour" + with pytest.raises(ValueError, match=msg): ts.replace(hour=0.1) def test_replace_tzinfo_equiv_tz_localize_none(self): diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 05bd967903e9d..f2969e15fad8a 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -1,5 +1,3 @@ -from datetime import 
datetime - import numpy as np import pytest @@ -8,162 +6,6 @@ import pandas._testing as tm -@pytest.mark.parametrize( - "first_slice,second_slice", - [ - [[2, None], [None, -5]], - [[None, 0], [None, -5]], - [[None, -5], [None, 0]], - [[None, 0], [None, 0]], - ], -) -@pytest.mark.parametrize("fill", [None, -1]) -def test_align(datetime_series, first_slice, second_slice, join_type, fill): - a = datetime_series[slice(*first_slice)] - b = datetime_series[slice(*second_slice)] - - aa, ab = a.align(b, join=join_type, fill_value=fill) - - join_index = a.index.join(b.index, how=join_type) - if fill is not None: - diff_a = aa.index.difference(join_index) - diff_b = ab.index.difference(join_index) - if len(diff_a) > 0: - assert (aa.reindex(diff_a) == fill).all() - if len(diff_b) > 0: - assert (ab.reindex(diff_b) == fill).all() - - ea = a.reindex(join_index) - eb = b.reindex(join_index) - - if fill is not None: - ea = ea.fillna(fill) - eb = eb.fillna(fill) - - tm.assert_series_equal(aa, ea) - tm.assert_series_equal(ab, eb) - assert aa.name == "ts" - assert ea.name == "ts" - assert ab.name == "ts" - assert eb.name == "ts" - - -@pytest.mark.parametrize( - "first_slice,second_slice", - [ - [[2, None], [None, -5]], - [[None, 0], [None, -5]], - [[None, -5], [None, 0]], - [[None, 0], [None, 0]], - ], -) -@pytest.mark.parametrize("method", ["pad", "bfill"]) -@pytest.mark.parametrize("limit", [None, 1]) -def test_align_fill_method( - datetime_series, first_slice, second_slice, join_type, method, limit -): - a = datetime_series[slice(*first_slice)] - b = datetime_series[slice(*second_slice)] - - aa, ab = a.align(b, join=join_type, method=method, limit=limit) - - join_index = a.index.join(b.index, how=join_type) - ea = a.reindex(join_index) - eb = b.reindex(join_index) - - ea = ea.fillna(method=method, limit=limit) - eb = eb.fillna(method=method, limit=limit) - - tm.assert_series_equal(aa, ea) - tm.assert_series_equal(ab, eb) - - -def test_align_nocopy(datetime_series): - b = datetime_series[:5].copy() - - # do copy - a = datetime_series.copy() - ra, _ = a.align(b, join="left") - ra[:5] = 5 - assert not (a[:5] == 5).any() - - # do not copy - a = datetime_series.copy() - ra, _ = a.align(b, join="left", copy=False) - ra[:5] = 5 - assert (a[:5] == 5).all() - - # do copy - a = datetime_series.copy() - b = datetime_series[:5].copy() - _, rb = a.align(b, join="right") - rb[:3] = 5 - assert not (b[:3] == 5).any() - - # do not copy - a = datetime_series.copy() - b = datetime_series[:5].copy() - _, rb = a.align(b, join="right", copy=False) - rb[:2] = 5 - assert (b[:2] == 5).all() - - -def test_align_same_index(datetime_series): - a, b = datetime_series.align(datetime_series, copy=False) - assert a.index is datetime_series.index - assert b.index is datetime_series.index - - a, b = datetime_series.align(datetime_series, copy=True) - assert a.index is not datetime_series.index - assert b.index is not datetime_series.index - - -def test_align_multiindex(): - # GH 10665 - - midx = pd.MultiIndex.from_product( - [range(2), range(3), range(2)], names=("a", "b", "c") - ) - idx = pd.Index(range(2), name="b") - s1 = pd.Series(np.arange(12, dtype="int64"), index=midx) - s2 = pd.Series(np.arange(2, dtype="int64"), index=idx) - - # these must be the same results (but flipped) - res1l, res1r = s1.align(s2, join="left") - res2l, res2r = s2.align(s1, join="right") - - expl = s1 - tm.assert_series_equal(expl, res1l) - tm.assert_series_equal(expl, res2r) - expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) - 
tm.assert_series_equal(expr, res1r) - tm.assert_series_equal(expr, res2l) - - res1l, res1r = s1.align(s2, join="right") - res2l, res2r = s2.align(s1, join="left") - - exp_idx = pd.MultiIndex.from_product( - [range(2), range(2), range(2)], names=("a", "b", "c") - ) - expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) - tm.assert_series_equal(expl, res1l) - tm.assert_series_equal(expl, res2r) - expr = pd.Series([0, 0, 1, 1] * 2, index=exp_idx) - tm.assert_series_equal(expr, res1r) - tm.assert_series_equal(expr, res2l) - - -@pytest.mark.parametrize("method", ["backfill", "bfill", "pad", "ffill", None]) -def test_align_method(method): - # GH31788 - ser = pd.Series(range(3), index=range(3)) - df = pd.DataFrame(0.0, index=range(3), columns=range(3)) - - result_ser, result_df = ser.align(df, method=method) - tm.assert_series_equal(result_ser, ser) - tm.assert_frame_equal(result_df, df) - - def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) @@ -305,25 +147,17 @@ def test_reindex_pad(): def test_reindex_nearest(): s = Series(np.arange(10, dtype="int64")) target = [0.1, 0.9, 1.5, 2.0] - actual = s.reindex(target, method="nearest") + result = s.reindex(target, method="nearest") expected = Series(np.around(target).astype("int64"), target) - tm.assert_series_equal(expected, actual) + tm.assert_series_equal(expected, result) - actual = s.reindex_like(actual, method="nearest") - tm.assert_series_equal(expected, actual) - - actual = s.reindex_like(actual, method="nearest", tolerance=1) - tm.assert_series_equal(expected, actual) - actual = s.reindex_like(actual, method="nearest", tolerance=[1, 2, 3, 4]) - tm.assert_series_equal(expected, actual) - - actual = s.reindex(target, method="nearest", tolerance=0.2) + result = s.reindex(target, method="nearest", tolerance=0.2) expected = Series([0, 1, np.nan, 2], target) - tm.assert_series_equal(expected, actual) + tm.assert_series_equal(expected, result) - actual = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) + result = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) expected = Series([0, np.nan, np.nan, 2], target) - tm.assert_series_equal(expected, actual) + tm.assert_series_equal(expected, result) def test_reindex_backfill(): @@ -393,25 +227,6 @@ def test_reindex_categorical(): tm.assert_series_equal(result, expected) -def test_reindex_like(datetime_series): - other = datetime_series[::2] - tm.assert_series_equal( - datetime_series.reindex(other.index), datetime_series.reindex_like(other) - ) - - # GH 7179 - day1 = datetime(2013, 3, 5) - day2 = datetime(2013, 5, 5) - day3 = datetime(2014, 3, 5) - - series1 = Series([5, None, None], [day1, day2, day3]) - series2 = Series([None, None], [day1, day3]) - - result = series1.reindex_like(series2, method="pad") - expected = Series([5, np.nan], index=[day1, day3]) - tm.assert_series_equal(result, expected) - - def test_reindex_fill_value(): # ----------------------------------------------------------- # floats @@ -477,94 +292,3 @@ def test_reindex_empty_series_tz_dtype(): result = Series(dtype="datetime64[ns, UTC]").reindex([0, 1]) expected = Series([pd.NaT] * 2, dtype="datetime64[ns, UTC]") tm.assert_equal(result, expected) - - -def test_rename(): - # GH 17407 - s = Series(range(1, 6), index=pd.Index(range(2, 7), name="IntIndex")) - result = s.rename(str) - expected = s.rename(lambda i: str(i)) - tm.assert_series_equal(result, expected) - - assert result.name == expected.name - - -@pytest.mark.parametrize( - "data, 
index, drop_labels, axis, expected_data, expected_index", - [ - # Unique Index - ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]), - ([1, 2], ["one", "two"], ["two"], "rows", [1], ["one"]), - ([1, 1, 2], ["one", "two", "one"], ["two"], 0, [1, 2], ["one", "one"]), - # GH 5248 Non-Unique Index - ([1, 1, 2], ["one", "two", "one"], "two", 0, [1, 2], ["one", "one"]), - ([1, 1, 2], ["one", "two", "one"], ["one"], 0, [1], ["two"]), - ([1, 1, 2], ["one", "two", "one"], "one", 0, [1], ["two"]), - ], -) -def test_drop_unique_and_non_unique_index( - data, index, axis, drop_labels, expected_data, expected_index -): - - s = Series(data=data, index=index) - result = s.drop(drop_labels, axis=axis) - expected = Series(data=expected_data, index=expected_index) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "data, index, drop_labels, axis, error_type, error_desc", - [ - # single string/tuple-like - (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), - # bad axis - (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"), - (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"), - ], -) -def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): - - with pytest.raises(error_type, match=error_desc): - Series(data, index=index).drop(drop_labels, axis=axis) - - -def test_drop_with_ignore_errors(): - # errors='ignore' - s = Series(range(3), index=list("abc")) - result = s.drop("bc", errors="ignore") - tm.assert_series_equal(result, s) - result = s.drop(["a", "d"], errors="ignore") - expected = s.iloc[1:] - tm.assert_series_equal(result, expected) - - # GH 8522 - s = Series([2, 3], index=[True, False]) - assert s.index.is_object() - result = s.drop(True) - expected = Series([3], index=[False]) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 3]]) -@pytest.mark.parametrize("drop_labels", [[], [1], [3]]) -def test_drop_empty_list(index, drop_labels): - # GH 21494 - expected_index = [i for i in index if i not in drop_labels] - series = pd.Series(index=index, dtype=object).drop(drop_labels) - expected = pd.Series(index=expected_index, dtype=object) - tm.assert_series_equal(series, expected) - - -@pytest.mark.parametrize( - "data, index, drop_labels", - [ - (None, [1, 2, 3], [1, 4]), - (None, [1, 2, 2], [1, 4]), - ([2, 3], [0, 1], [False, True]), - ], -) -def test_drop_non_empty_list(data, index, drop_labels): - # GH 21494 and GH 16877 - with pytest.raises(KeyError, match="not found in axis"): - dtype = object if data is None else None - pd.Series(data=data, index=index, dtype=dtype).drop(drop_labels) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index fc9d4ec5290a5..e369631a5565d 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -130,18 +130,6 @@ def test_slicing_datetimes(): tm.assert_frame_equal(result, expected) -def test_frame_datetime64_duplicated(): - dates = date_range("2010-07-01", end="2010-08-05") - - tst = DataFrame({"symbol": "AAA", "date": dates}) - result = tst.duplicated(["date", "symbol"]) - assert (-result).all() - - tst = DataFrame({"date": dates}) - result = tst.duplicated() - assert (-result).all() - - def test_getitem_setitem_datetime_tz_pytz(): from pytz import timezone as tz @@ -293,7 +281,7 @@ def test_getitem_setitem_datetimeindex(): result = ts.copy() result[ts.index[4:8]] = 0 - result[4:8] = ts[4:8] + 
result.iloc[4:8] = ts.iloc[4:8] tm.assert_series_equal(result, ts) # also test partial date slicing @@ -349,24 +337,10 @@ def test_getitem_setitem_periodindex(): result = ts.copy() result[ts.index[4:8]] = 0 - result[4:8] = ts[4:8] + result.iloc[4:8] = ts.iloc[4:8] tm.assert_series_equal(result, ts) -# FutureWarning from NumPy. -@pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") -def test_getitem_median_slice_bug(): - index = date_range("20090415", "20090519", freq="2B") - s = Series(np.random.randn(13), index=index) - - indexer = [slice(6, 7, None)] - with tm.assert_produces_warning(FutureWarning): - # GH#31299 - result = s[indexer] - expected = s[indexer[0]] - tm.assert_series_equal(result, expected) - - def test_datetime_indexing(): index = date_range("1/1/2000", "1/7/2000") @@ -464,12 +438,6 @@ def test_index_unique(dups): assert idx.nunique(dropna=False) == 21 -def test_index_dupes_contains(): - d = datetime(2011, 12, 5, 20, 30) - ix = DatetimeIndex([d, d]) - assert d in ix - - def test_duplicate_dates_indexing(dups): ts = dups @@ -690,30 +658,6 @@ def test_indexing(): """ -def test_set_none_nan(): - series = Series(date_range("1/1/2000", periods=10)) - series[3] = None - assert series[3] is NaT - - series[3:5] = None - assert series[4] is NaT - - series[5] = np.nan - assert series[5] is NaT - - series[5:7] = np.nan - assert series[6] is NaT - - -def test_nat_operations(): - # GH 8617 - s = Series([0, pd.NaT], dtype="m8[ns]") - exp = s[0] - assert s.median() == exp - assert s.min() == exp - assert s.max() == exp - - def test_setitem_tuple_with_datetimetz(): # GH 20441 arr = date_range("2017", periods=4, tz="US/Eastern") diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py new file mode 100644 index 0000000000000..6c7e3f2b06983 --- /dev/null +++ b/pandas/tests/series/indexing/test_delitem.py @@ -0,0 +1,49 @@ +import pytest + +from pandas import Index, Series +import pandas._testing as tm + + +class TestSeriesDelItem: + def test_delitem(self): + # GH#5542 + # should delete the item inplace + s = Series(range(5)) + del s[0] + + expected = Series(range(1, 5), index=range(1, 5)) + tm.assert_series_equal(s, expected) + + del s[1] + expected = Series(range(2, 5), index=range(2, 5)) + tm.assert_series_equal(s, expected) + + # only 1 left, del, add, del + s = Series(1) + del s[0] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) + s[0] = 1 + tm.assert_series_equal(s, Series(1)) + del s[0] + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) + + def test_delitem_object_index(self): + # Index(dtype=object) + s = Series(1, index=["a"]) + del s["a"] + tm.assert_series_equal( + s, Series(dtype="int64", index=Index([], dtype="object")) + ) + s["a"] = 1 + tm.assert_series_equal(s, Series(1, index=["a"])) + del s["a"] + tm.assert_series_equal( + s, Series(dtype="int64", index=Index([], dtype="object")) + ) + + def test_delitem_missing_key(self): + # empty + s = Series(dtype=object) + + with pytest.raises(KeyError, match=r"^0$"): + del s[0] diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py index 438b61ed203a3..3371c47fa1b0a 100644 --- a/pandas/tests/series/indexing/test_get.py +++ b/pandas/tests/series/indexing/test_get.py @@ -1,7 +1,9 @@ import numpy as np +import pytest import pandas as pd from pandas import Series +import pandas._testing as tm def test_get(): @@ -132,3 +134,61 @@ def test_get_nan_multiple(): idx = [np.nan, 
np.nan] assert s.get(idx) is None + + +def test_get_with_default(): + # GH#7725 + d0 = ["a", "b", "c", "d"] + d1 = np.arange(4, dtype="int64") + others = ["e", 10] + + for data, index in ((d0, d1), (d1, d0)): + s = Series(data, index=index) + for i, d in zip(index, data): + assert s.get(i) == d + assert s.get(i, d) == d + assert s.get(i, "z") == d + for other in others: + assert s.get(other, "z") == "z" + assert s.get(other, other) == other + + +@pytest.mark.parametrize( + "arr", + [np.random.randn(10), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")], +) +def test_get2(arr): + # TODO: better name, possibly split + # GH#21260 + ser = Series(arr, index=[2 * i for i in range(len(arr))]) + assert ser.get(4) == ser.iloc[2] + + result = ser.get([4, 6]) + expected = ser.iloc[[2, 3]] + tm.assert_series_equal(result, expected) + + result = ser.get(slice(2)) + expected = ser.iloc[[0, 1]] + tm.assert_series_equal(result, expected) + + assert ser.get(-1) is None + assert ser.get(ser.index.max() + 1) is None + + ser = Series(arr[:6], index=list("abcdef")) + assert ser.get("c") == ser.iloc[2] + + result = ser.get(slice("b", "d")) + expected = ser.iloc[[1, 2, 3]] + tm.assert_series_equal(result, expected) + + result = ser.get("Z") + assert result is None + + assert ser.get(4) == ser.iloc[4] + assert ser.get(-1) == ser.iloc[-1] + assert ser.get(len(ser)) is None + + # GH#21257 + ser = Series(arr) + ser2 = ser[::2] + assert ser2.get(1) is None diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py new file mode 100644 index 0000000000000..a49bd6d59d01b --- /dev/null +++ b/pandas/tests/series/indexing/test_getitem.py @@ -0,0 +1,100 @@ +""" +Series.__getitem__ test classes are organized by the type of key passed. +""" +from datetime import datetime + +import numpy as np +import pytest + +from pandas._libs.tslibs import conversion, timezones + +import pandas as pd +from pandas import Series, Timestamp, date_range, period_range +import pandas._testing as tm + + +class TestSeriesGetitemScalars: + + # TODO: better name/GH ref? + def test_getitem_regression(self): + ser = Series(range(5), index=list(range(5))) + result = ser[list(range(5))] + tm.assert_series_equal(result, ser) + + # ------------------------------------------------------------------ + # Series with DatetimeIndex + + @pytest.mark.parametrize("tzstr", ["Europe/Berlin", "dateutil/Europe/Berlin"]) + def test_getitem_pydatetime_tz(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + index = date_range( + start="2012-12-24 16:00", end="2012-12-24 18:00", freq="H", tz=tzstr + ) + ts = Series(index=index, data=index.hour) + time_pandas = Timestamp("2012-12-24 17:00", tz=tzstr) + + dt = datetime(2012, 12, 24, 17, 0) + time_datetime = conversion.localize_pydatetime(dt, tz) + assert ts[time_pandas] == ts[time_datetime] + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_string_index_alias_tz_aware(self, tz): + rng = date_range("1/1/2000", periods=10, tz=tz) + ser = Series(np.random.randn(len(rng)), index=rng) + + result = ser["1/3/2000"] + tm.assert_almost_equal(result, ser[2]) + + +class TestSeriesGetitemSlices: + def test_getitem_slice_2d(self, datetime_series): + # GH#30588 multi-dimensional indexing deprecated + + # This is currently failing because the test was relying on + # the DeprecationWarning coming through Index.__getitem__. 
+ # We want to implement a warning specifically for Series.__getitem__ + # at which point this will become a Deprecation/FutureWarning + with tm.assert_produces_warning(None): + # GH#30867 Don't want to support this long-term, but + # for now ensure that the warning from Index + # doesn't comes through via Series.__getitem__. + result = datetime_series[:, np.newaxis] + expected = datetime_series.values[:, np.newaxis] + tm.assert_almost_equal(result, expected) + + # FutureWarning from NumPy. + @pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") + def test_getitem_median_slice_bug(self): + index = date_range("20090415", "20090519", freq="2B") + s = Series(np.random.randn(13), index=index) + + indexer = [slice(6, 7, None)] + with tm.assert_produces_warning(FutureWarning): + # GH#31299 + result = s[indexer] + expected = s[indexer[0]] + tm.assert_series_equal(result, expected) + + +class TestSeriesGetitemListLike: + def test_getitem_intlist_intindex_periodvalues(self): + ser = Series(period_range("2000-01-01", periods=10, freq="D")) + + result = ser[[2, 4]] + exp = pd.Series( + [pd.Period("2000-01-03", freq="D"), pd.Period("2000-01-05", freq="D")], + index=[2, 4], + dtype="Period[D]", + ) + tm.assert_series_equal(result, exp) + assert result.dtype == "Period[D]" + + +def test_getitem_generator(string_series): + gen = (x > 0 for x in string_series) + result = string_series[gen] + result2 = string_series[iter(string_series > 0)] + expected = string_series[string_series > 0] + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 18fcbea683dd3..522ed4df96ad2 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -119,15 +119,6 @@ def test_getitem_fancy(string_series, object_series): assert object_series[2] == slice2[1] -def test_getitem_generator(string_series): - gen = (x > 0 for x in string_series) - result = string_series[gen] - result2 = string_series[iter(string_series > 0)] - expected = string_series[string_series > 0] - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result2, expected) - - def test_type_promotion(): # GH12599 s = pd.Series(dtype=object) @@ -188,46 +179,6 @@ def test_getitem_box_float64(datetime_series): assert isinstance(value, np.float64) -@pytest.mark.parametrize( - "arr", - [np.random.randn(10), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")], -) -def test_get(arr): - # GH 21260 - s = Series(arr, index=[2 * i for i in range(len(arr))]) - assert s.get(4) == s.iloc[2] - - result = s.get([4, 6]) - expected = s.iloc[[2, 3]] - tm.assert_series_equal(result, expected) - - result = s.get(slice(2)) - expected = s.iloc[[0, 1]] - tm.assert_series_equal(result, expected) - - assert s.get(-1) is None - assert s.get(s.index.max() + 1) is None - - s = Series(arr[:6], index=list("abcdef")) - assert s.get("c") == s.iloc[2] - - result = s.get(slice("b", "d")) - expected = s.iloc[[1, 2, 3]] - tm.assert_series_equal(result, expected) - - result = s.get("Z") - assert result is None - - assert s.get(4) == s.iloc[4] - assert s.get(-1) == s.iloc[-1] - assert s.get(len(s)) is None - - # GH 21257 - s = pd.Series(arr) - s2 = s[::2] - assert s2.get(1) is None - - def test_series_box_timestamp(): rng = pd.date_range("20090415", "20090519", freq="B") ser = Series(rng) @@ -669,11 +620,8 @@ def test_timedelta_assignment(): s = s.reindex(s.index.insert(0, 
"A")) tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) - result = s.fillna(timedelta(1)) - expected = Series(Timedelta("1 days"), index=["A", "B"]) - tm.assert_series_equal(result, expected) - s.loc["A"] = timedelta(1) + expected = Series(Timedelta("1 days"), index=["A", "B"]) tm.assert_series_equal(s, expected) # GH 14155 @@ -915,11 +863,13 @@ def test_uint_drop(any_int_dtype): tm.assert_series_equal(series, expected) -def test_getitem_2d_no_warning(): - # https://github.com/pandas-dev/pandas/issues/30867 - # Don't want to support this long-term, but - # for now ensure that the warning from Index - # doesn't comes through via Series.__getitem__. - series = pd.Series([1, 2, 3], index=[1, 2, 3]) - with tm.assert_produces_warning(None): - series[:, None] +def test_getitem_unrecognized_scalar(): + # GH#32684 a scalar key that is not recognized by lib.is_scalar + + # a series that might be produced via `frame.dtypes` + ser = pd.Series([1, 2], index=[np.dtype("O"), np.dtype("i8")]) + + key = ser.index[1] + + result = ser[key] + assert result == 2 diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index 7e73e6366438b..b5bef46e95ec2 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -5,44 +5,6 @@ import pandas._testing as tm -def test_delitem(): - # GH 5542 - # should delete the item inplace - s = Series(range(5)) - del s[0] - - expected = Series(range(1, 5), index=range(1, 5)) - tm.assert_series_equal(s, expected) - - del s[1] - expected = Series(range(2, 5), index=range(2, 5)) - tm.assert_series_equal(s, expected) - - # empty - s = Series(dtype=object) - - with pytest.raises(KeyError, match=r"^0$"): - del s[0] - - # only 1 left, del, add, del - s = Series(1) - del s[0] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - s[0] = 1 - tm.assert_series_equal(s, Series(1)) - del s[0] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - - # Index(dtype=object) - s = Series(1, index=["a"]) - del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="object"))) - s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=["a"])) - del s["a"] - tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="object"))) - - def test_slice_float64(): values = np.arange(10.0, 50.0, 2) index = Index(values) @@ -78,12 +40,6 @@ def test_getitem_negative_out_of_bounds(): s[-11] = "foo" -def test_getitem_regression(): - s = Series(range(5), index=list(range(5))) - result = s[list(range(5))] - tm.assert_series_equal(result, s) - - def test_getitem_setitem_slice_bug(): s = Series(range(10), index=list(range(10))) result = s[-12:] diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py new file mode 100644 index 0000000000000..3463de25ad91b --- /dev/null +++ b/pandas/tests/series/indexing/test_setitem.py @@ -0,0 +1,19 @@ +import numpy as np + +from pandas import NaT, Series, date_range + + +class TestSetitemDT64Values: + def test_setitem_none_nan(self): + series = Series(date_range("1/1/2000", periods=10)) + series[3] = None + assert series[3] is NaT + + series[3:5] = None + assert series[4] is NaT + + series[5] = np.nan + assert series[5] is NaT + + series[5:7] = np.nan + assert series[6] is NaT diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 
9703f5afaf689..6765d9f9d8266 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -435,3 +435,11 @@ def test_where_dt_tz_values(tz_naive_fixture): pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture) ) tm.assert_series_equal(exp, result) + + +def test_where_sparse(): + # GH#17198 make sure we dont get an AttributeError for sp_index + ser = pd.Series(pd.arrays.SparseArray([1, 2])) + result = ser.where(ser >= 2, 0) + expected = pd.Series(pd.arrays.SparseArray([0, 2])) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py new file mode 100644 index 0000000000000..974ba5d1e35a7 --- /dev/null +++ b/pandas/tests/series/methods/test_align.py @@ -0,0 +1,182 @@ +import numpy as np +import pytest +import pytz + +import pandas as pd +from pandas import Series, date_range, period_range +import pandas._testing as tm + + +@pytest.mark.parametrize( + "first_slice,second_slice", + [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("fill", [None, -1]) +def test_align(datetime_series, first_slice, second_slice, join_type, fill): + a = datetime_series[slice(*first_slice)] + b = datetime_series[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, fill_value=fill) + + join_index = a.index.join(b.index, how=join_type) + if fill is not None: + diff_a = aa.index.difference(join_index) + diff_b = ab.index.difference(join_index) + if len(diff_a) > 0: + assert (aa.reindex(diff_a) == fill).all() + if len(diff_b) > 0: + assert (ab.reindex(diff_b) == fill).all() + + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + if fill is not None: + ea = ea.fillna(fill) + eb = eb.fillna(fill) + + tm.assert_series_equal(aa, ea) + tm.assert_series_equal(ab, eb) + assert aa.name == "ts" + assert ea.name == "ts" + assert ab.name == "ts" + assert eb.name == "ts" + + +@pytest.mark.parametrize( + "first_slice,second_slice", + [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("method", ["pad", "bfill"]) +@pytest.mark.parametrize("limit", [None, 1]) +def test_align_fill_method( + datetime_series, first_slice, second_slice, join_type, method, limit +): + a = datetime_series[slice(*first_slice)] + b = datetime_series[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, method=method, limit=limit) + + join_index = a.index.join(b.index, how=join_type) + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + ea = ea.fillna(method=method, limit=limit) + eb = eb.fillna(method=method, limit=limit) + + tm.assert_series_equal(aa, ea) + tm.assert_series_equal(ab, eb) + + +def test_align_nocopy(datetime_series): + b = datetime_series[:5].copy() + + # do copy + a = datetime_series.copy() + ra, _ = a.align(b, join="left") + ra[:5] = 5 + assert not (a[:5] == 5).any() + + # do not copy + a = datetime_series.copy() + ra, _ = a.align(b, join="left", copy=False) + ra[:5] = 5 + assert (a[:5] == 5).all() + + # do copy + a = datetime_series.copy() + b = datetime_series[:5].copy() + _, rb = a.align(b, join="right") + rb[:3] = 5 + assert not (b[:3] == 5).any() + + # do not copy + a = datetime_series.copy() + b = datetime_series[:5].copy() + _, rb = a.align(b, join="right", copy=False) + rb[:2] = 5 + assert (b[:2] == 5).all() + + +def test_align_same_index(datetime_series): + a, b = 
datetime_series.align(datetime_series, copy=False) + assert a.index is datetime_series.index + assert b.index is datetime_series.index + + a, b = datetime_series.align(datetime_series, copy=True) + assert a.index is not datetime_series.index + assert b.index is not datetime_series.index + + +def test_align_multiindex(): + # GH 10665 + + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + s1 = pd.Series(np.arange(12, dtype="int64"), index=midx) + s2 = pd.Series(np.arange(2, dtype="int64"), index=idx) + + # these must be the same results (but flipped) + res1l, res1r = s1.align(s2, join="left") + res2l, res2r = s2.align(s1, join="right") + + expl = s1 + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = pd.Series([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + res1l, res1r = s1.align(s2, join="right") + res2l, res2r = s2.align(s1, join="left") + + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) + expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) + tm.assert_series_equal(expl, res1l) + tm.assert_series_equal(expl, res2r) + expr = pd.Series([0, 0, 1, 1] * 2, index=exp_idx) + tm.assert_series_equal(expr, res1r) + tm.assert_series_equal(expr, res2l) + + +@pytest.mark.parametrize("method", ["backfill", "bfill", "pad", "ffill", None]) +def test_align_with_dataframe_method(method): + # GH31788 + ser = pd.Series(range(3), index=range(3)) + df = pd.DataFrame(0.0, index=range(3), columns=range(3)) + + result_ser, result_df = ser.align(df, method=method) + tm.assert_series_equal(result_ser, ser) + tm.assert_frame_equal(result_df, df) + + +def test_align_dt64tzindex_mismatched_tzs(): + idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") + ser = Series(np.random.randn(len(idx1)), index=idx1) + ser_central = ser.tz_convert("US/Central") + # different timezones convert to UTC + + new1, new2 = ser.align(ser_central) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + +def test_align_periodindex(join_type): + rng = period_range("1/1/2000", "1/1/2010", freq="A") + ts = Series(np.random.randn(len(rng)), index=rng) + + # TODO: assert something? 
+ ts.align(ts[::2], join=join_type) diff --git a/pandas/tests/series/methods/test_append.py b/pandas/tests/series/methods/test_append.py index 4742d6ae3544f..158c759fdaef3 100644 --- a/pandas/tests/series/methods/test_append.py +++ b/pandas/tests/series/methods/test_append.py @@ -61,15 +61,15 @@ def test_append_tuples(self): tm.assert_series_equal(expected, result) - def test_append_dataframe_regression(self): - # GH 30975 - df = pd.DataFrame({"A": [1, 2]}) - result = df.A.append([df]) - expected = pd.DataFrame( - {0: [1.0, 2.0, None, None], "A": [None, None, 1.0, 2.0]}, index=[0, 1, 0, 1] - ) - - tm.assert_frame_equal(expected, result) + def test_append_dataframe_raises(self): + # GH 31413 + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + + msg = "to_append should be a Series or list/tuple of Series, got DataFrame" + with pytest.raises(TypeError, match=msg): + df.A.append(df) + with pytest.raises(TypeError, match=msg): + df.A.append([df]) class TestSeriesAppendWithDatetimeIndex: diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index c7fe6ed19a2eb..4353eb4c8cd64 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -49,8 +49,8 @@ def test_argsort_stable(self): mexpected = np.argsort(s.values, kind="mergesort") qexpected = np.argsort(s.values, kind="quicksort") - tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False) - tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) + tm.assert_series_equal(mindexer.astype(np.intp), Series(mexpected)) + tm.assert_series_equal(qindexer.astype(np.intp), Series(qexpected)) msg = ( r"ndarray Expected type , " r"found instead" diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index b121efd202744..ad5a2de6eabac 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._libs.tslibs import IncompatibleFrequency + from pandas import Series, Timestamp, date_range, isna, notna, offsets import pandas._testing as tm @@ -12,7 +14,7 @@ def test_basic(self): N = 50 rng = date_range("1/1/1990", periods=N, freq="53s") ts = Series(np.random.randn(N), index=rng) - ts[15:30] = np.nan + ts.iloc[15:30] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = ts.asof(dates) @@ -37,8 +39,8 @@ def test_scalar(self): N = 30 rng = date_range("1/1/1990", periods=N, freq="53s") ts = Series(np.arange(N), index=rng) - ts[5:10] = np.NaN - ts[15:20] = np.NaN + ts.iloc[5:10] = np.NaN + ts.iloc[15:20] = np.NaN val1 = ts.asof(ts.index[7]) val2 = ts.asof(ts.index[19]) @@ -94,7 +96,7 @@ def test_periodindex(self): N = 50 rng = period_range("1/1/1990", periods=N, freq="H") ts = Series(np.random.randn(N), index=rng) - ts[15:30] = np.nan + ts.iloc[15:30] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="37min") result = ts.asof(dates) @@ -112,8 +114,8 @@ def test_periodindex(self): rs = result[mask] assert (rs == ts[lb]).all() - ts[5:10] = np.nan - ts[15:20] = np.nan + ts.iloc[5:10] = np.nan + ts.iloc[15:20] = np.nan val1 = ts.asof(ts.index[7]) val2 = ts.asof(ts.index[19]) @@ -132,6 +134,11 @@ def test_periodindex(self): d = ts.index[0].to_timestamp() - offsets.BDay() assert isna(ts.asof(d)) + # Mismatched freq + msg = "Input has different freq" + with pytest.raises(IncompatibleFrequency, match=msg): + ts.asof(rng.asfreq("D")) + def test_errors(self): s = Series( diff --git 
a/pandas/tests/series/methods/test_autocorr.py b/pandas/tests/series/methods/test_autocorr.py new file mode 100644 index 0000000000000..05e3540a7e702 --- /dev/null +++ b/pandas/tests/series/methods/test_autocorr.py @@ -0,0 +1,30 @@ +import numpy as np + + +class TestAutoCorr: + def test_autocorr(self, datetime_series): + # Just run the function + corr1 = datetime_series.autocorr() + + # Now run it with the lag parameter + corr2 = datetime_series.autocorr(lag=1) + + # corr() with lag needs Series of at least length 2 + if len(datetime_series) <= 2: + assert np.isnan(corr1) + assert np.isnan(corr2) + else: + assert corr1 == corr2 + + # Choose a random lag between 1 and length of Series - 2 + # and compare the result with the Series corr() function + n = 1 + np.random.randint(max(1, len(datetime_series) - 2)) + corr1 = datetime_series.corr(datetime_series.shift(n)) + corr2 = datetime_series.autocorr(lag=n) + + # corr() with lag needs Series of at least length 2 + if len(datetime_series) <= 2: + assert np.isnan(corr1) + assert np.isnan(corr2) + else: + assert corr1 == corr2 diff --git a/pandas/tests/series/methods/test_between_time.py b/pandas/tests/series/methods/test_between_time.py index 3fa26afe77a1d..e9d2f8e6f1637 100644 --- a/pandas/tests/series/methods/test_between_time.py +++ b/pandas/tests/series/methods/test_between_time.py @@ -139,6 +139,6 @@ def test_between_time_axis(self): assert len(ts.between_time(stime, etime)) == expected_length assert len(ts.between_time(stime, etime, axis=0)) == expected_length - msg = "No axis named 1 for object type <class 'pandas.core.series.Series'>" + msg = "No axis named 1 for object type Series" with pytest.raises(ValueError, match=msg): ts.between_time(stime, etime, axis=1) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 17527a09f07a1..dd4bf642e68e8 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_interval_dtype + import pandas as pd import pandas._testing as tm @@ -266,7 +268,12 @@ def test_convert_dtypes(self, data, maindtype, params, answerdict): # Test that it is a copy copy = series.copy(deep=True) - ns[ns.notna()] = np.nan + if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]: + msg = "Cannot set float NaN to integer-backed IntervalArray" + with pytest.raises(ValueError, match=msg): + ns[ns.notna()] = np.nan + else: + ns[ns.notna()] = np.nan # Make sure original not changed tm.assert_series_equal(series, copy) @@ -279,3 +286,8 @@ def test_convert_string_dtype(self): ) result = df.convert_dtypes() tm.assert_frame_equal(df, result) + + def test_convert_bool_dtype(self): + # GH32287 + df = pd.DataFrame({"A": pd.array([True])}) + tm.assert_frame_equal(df, df.convert_dtypes()) diff --git a/pandas/tests/series/methods/test_drop.py b/pandas/tests/series/methods/test_drop.py new file mode 100644 index 0000000000000..197fe9ff68df2 --- /dev/null +++ b/pandas/tests/series/methods/test_drop.py @@ -0,0 +1,87 @@ +import pytest + +import pandas as pd +from pandas import Series +import pandas._testing as tm + + +@pytest.mark.parametrize( + "data, index, drop_labels, axis, expected_data, expected_index", + [ + # Unique Index + ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]), + ([1, 2], ["one", "two"], ["two"], "rows", [1], ["one"]), + ([1, 1, 2], ["one", "two", "one"], ["two"], 0, [1, 2], ["one", "one"]), + # GH 5248 Non-Unique Index + 
([1, 1, 2], ["one", "two", "one"], "two", 0, [1, 2], ["one", "one"]), + ([1, 1, 2], ["one", "two", "one"], ["one"], 0, [1], ["two"]), + ([1, 1, 2], ["one", "two", "one"], "one", 0, [1], ["two"]), + ], +) +def test_drop_unique_and_non_unique_index( + data, index, axis, drop_labels, expected_data, expected_index +): + + s = Series(data=data, index=index) + result = s.drop(drop_labels, axis=axis) + expected = Series(data=expected_data, index=expected_index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data, index, drop_labels, axis, error_type, error_desc", + [ + # single string/tuple-like + (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), + # bad axis + (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"), + (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"), + ], +) +def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): + ser = Series(data, index=index) + with pytest.raises(error_type, match=error_desc): + ser.drop(drop_labels, axis=axis) + + +def test_drop_with_ignore_errors(): + # errors='ignore' + s = Series(range(3), index=list("abc")) + result = s.drop("bc", errors="ignore") + tm.assert_series_equal(result, s) + result = s.drop(["a", "d"], errors="ignore") + expected = s.iloc[1:] + tm.assert_series_equal(result, expected) + + # GH 8522 + s = Series([2, 3], index=[True, False]) + assert s.index.is_object() + result = s.drop(True) + expected = Series([3], index=[False]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 3]]) +@pytest.mark.parametrize("drop_labels", [[], [1], [3]]) +def test_drop_empty_list(index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + series = pd.Series(index=index, dtype=object).drop(drop_labels) + expected = pd.Series(index=expected_index, dtype=object) + tm.assert_series_equal(series, expected) + + +@pytest.mark.parametrize( + "data, index, drop_labels", + [ + (None, [1, 2, 3], [1, 4]), + (None, [1, 2, 2], [1, 4]), + ([2, 3], [0, 1], [False, True]), + ], +) +def test_drop_non_empty_list(data, index, drop_labels): + # GH 21494 and GH 16877 + dtype = object if data is None else None + ser = pd.Series(data=data, index=index, dtype=dtype) + with pytest.raises(KeyError, match="not found in axis"): + ser.drop(drop_labels) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 2d052505d5ecc..a4532ebb3d8c5 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -44,17 +44,37 @@ def test_drop_duplicates_bool(keep, expected): tm.assert_series_equal(sc, tc[~expected]) +@pytest.mark.parametrize("values", [[], list(range(5))]) +def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values): + tc = Series(values, dtype=np.dtype(any_numpy_dtype)) + expected = Series([False] * len(tc), dtype="bool") + + if tc.dtype == "bool": + # 0 -> False and 1-> True + # any other value would be duplicated + tc = tc[:2] + expected = expected[:2] + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + + result_dropped = tc.drop_duplicates(keep=keep) + tm.assert_series_equal(result_dropped, tc) + + # validate shallow copy + assert result_dropped is not tc + + class TestSeriesDropDuplicates: @pytest.mark.parametrize( "dtype", ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], ) - def 
test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): + def test_drop_duplicates_categorical_non_bool(self, dtype, ordered): cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) # Test case 1 input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) - tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) + tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered)) if dtype == "datetime64[D]": # pre-empty flaky xfail, tc1 values are seemingly-random if not (np.array(tc1) == input1).all(): @@ -83,7 +103,7 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): # Test case 2 input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) - tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) + tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered)) if dtype == "datetime64[D]": # pre-empty flaky xfail, tc2 values are seemingly-random if not (np.array(tc2) == input2).all(): @@ -110,12 +130,10 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): sc.drop_duplicates(keep=False, inplace=True) tm.assert_series_equal(sc, tc2[~expected]) - def test_drop_duplicates_categorical_bool(self, ordered_fixture): + def test_drop_duplicates_categorical_bool(self, ordered): tc = Series( Categorical( - [True, False, True, False], - categories=[True, False], - ordered=ordered_fixture, + [True, False, True, False], categories=[True, False], ordered=ordered, ) ) diff --git a/pandas/tests/series/methods/test_droplevel.py b/pandas/tests/series/methods/test_droplevel.py new file mode 100644 index 0000000000000..435eb5751de4b --- /dev/null +++ b/pandas/tests/series/methods/test_droplevel.py @@ -0,0 +1,19 @@ +import pytest + +from pandas import MultiIndex, Series +import pandas._testing as tm + + +class TestDropLevel: + def test_droplevel(self): + # GH#20342 + ser = Series([1, 2, 3, 4]) + ser.index = MultiIndex.from_arrays( + [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"] + ) + expected = ser.reset_index("b", drop=True) + result = ser.droplevel("b", axis="index") + tm.assert_series_equal(result, expected) + # test that droplevel raises ValueError on axis != 0 + with pytest.raises(ValueError): + ser.droplevel(1, axis="columns") diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py new file mode 100644 index 0000000000000..c34838be24fc1 --- /dev/null +++ b/pandas/tests/series/methods/test_fillna.py @@ -0,0 +1,176 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas import Categorical, DataFrame, NaT, Period, Series, Timedelta, Timestamp +import pandas._testing as tm + + +class TestSeriesFillNA: + def test_fillna_pytimedelta(self): + # GH#8209 + ser = Series([np.nan, Timedelta("1 days")], index=["A", "B"]) + + result = ser.fillna(timedelta(1)) + expected = Series(Timedelta("1 days"), index=["A", "B"]) + tm.assert_series_equal(result, expected) + + def test_fillna_period(self): + # GH#13737 + ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")]) + + res = ser.fillna(Period("2012-01", freq="M")) + exp = Series([Period("2011-01", freq="M"), Period("2012-01", freq="M")]) + tm.assert_series_equal(res, exp) + assert res.dtype == "Period[M]" + + def test_fillna_dt64_timestamp(self): + ser = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) + ser[2] = np.nan + + # reg fillna + result = 
ser.fillna(Timestamp("20130104")) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130104"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + result = ser.fillna(NaT) + expected = ser + tm.assert_series_equal(result, expected) + + def test_fillna_dt64_non_nao(self): + # GH#27419 + ser = Series([Timestamp("2010-01-01"), NaT, Timestamp("2000-01-01")]) + val = np.datetime64("1975-04-05", "ms") + + result = ser.fillna(val) + expected = Series( + [Timestamp("2010-01-01"), Timestamp("1975-04-05"), Timestamp("2000-01-01")] + ) + tm.assert_series_equal(result, expected) + + def test_fillna_numeric_inplace(self): + x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) + y = x.copy() + + y.fillna(value=0, inplace=True) + + expected = x.fillna(value=0) + tm.assert_series_equal(y, expected) + + # --------------------------------------------------------------- + # CategoricalDtype + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + ("a", ["a", "a", "b", "a", "a"]), + ({1: "a", 3: "b", 4: "b"}, ["a", "a", "b", "b", "b"]), + ({1: "a"}, ["a", "a", "b", np.nan, np.nan]), + ({1: "a", 3: "b"}, ["a", "a", "b", "b", np.nan]), + (Series("a"), ["a", np.nan, "b", np.nan, np.nan]), + (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]), + (Series({1: "a", 3: "b"}), ["a", "a", "b", "b", np.nan]), + (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]), + ], + ) + def test_fillna_categorical(self, fill_value, expected_output): + # GH#17033 + # Test fillna for a Categorical series + data = ["a", np.nan, "b", np.nan, np.nan] + ser = Series(Categorical(data, categories=["a", "b"])) + exp = Series(Categorical(expected_output, categories=["a", "b"])) + result = ser.fillna(fill_value) + tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), + (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), + ( + Series( + Categorical( + ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] + ) + ), + ["a", "d", "b", "d", "a"], + ), + ], + ) + def test_fillna_categorical_with_new_categories(self, fill_value, expected_output): + # GH#26215 + data = ["a", np.nan, "b", np.nan, np.nan] + ser = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) + exp = Series(Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) + result = ser.fillna(fill_value) + tm.assert_series_equal(result, exp) + + def test_fillna_categorical_raises(self): + data = ["a", np.nan, "b", np.nan, np.nan] + ser = Series(Categorical(data, categories=["a", "b"])) + + with pytest.raises(ValueError, match="fill value must be in categories"): + ser.fillna("d") + + with pytest.raises(ValueError, match="fill value must be in categories"): + ser.fillna(Series("d")) + + with pytest.raises(ValueError, match="fill value must be in categories"): + ser.fillna({1: "d", 3: "a"}) + + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' + with pytest.raises(TypeError, match=msg): + ser.fillna(["a", "b"]) + + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' + with pytest.raises(TypeError, match=msg): + ser.fillna(("a", "b")) + + msg = ( + '"value" parameter must be a scalar, dict ' + 'or Series, but you passed a "DataFrame"' + ) + with pytest.raises(TypeError, match=msg): + ser.fillna(DataFrame({1: ["a"], 3: ["b"]})) + + # 
--------------------------------------------------------------- + # Invalid Usages + + def test_fillna_listlike_invalid(self): + ser = Series(np.random.randint(-100, 100, 50)) + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' + with pytest.raises(TypeError, match=msg): + ser.fillna([1, 2]) + + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' + with pytest.raises(TypeError, match=msg): + ser.fillna((1, 2)) + + def test_fillna_method_and_limit_invalid(self): + + # related GH#9217, make sure limit is an int and greater than 0 + ser = Series([1, 2, 3, None]) + msg = ( + r"Cannot specify both 'value' and 'method'\.|" + r"Limit must be greater than 0|" + "Limit must be an integer" + ) + for limit in [-1, 0, 1.0, 2.0]: + for method in ["backfill", "bfill", "pad", "ffill", None]: + with pytest.raises(ValueError, match=msg): + ser.fillna(1, limit=limit, method=method) diff --git a/pandas/tests/series/methods/test_first_and_last.py b/pandas/tests/series/methods/test_first_and_last.py new file mode 100644 index 0000000000000..7629dc8cda30b --- /dev/null +++ b/pandas/tests/series/methods/test_first_and_last.py @@ -0,0 +1,69 @@ +""" +Note: includes tests for `last` +""" + +import numpy as np +import pytest + +from pandas import Series, date_range +import pandas._testing as tm + + +class TestFirst: + def test_first_subset(self): + rng = date_range("1/1/2000", "1/1/2010", freq="12h") + ts = Series(np.random.randn(len(rng)), index=rng) + result = ts.first("10d") + assert len(result) == 20 + + rng = date_range("1/1/2000", "1/1/2010", freq="D") + ts = Series(np.random.randn(len(rng)), index=rng) + result = ts.first("10d") + assert len(result) == 10 + + result = ts.first("3M") + expected = ts[:"3/31/2000"] + tm.assert_series_equal(result, expected) + + result = ts.first("21D") + expected = ts[:21] + tm.assert_series_equal(result, expected) + + result = ts[:0].first("3M") + tm.assert_series_equal(result, ts[:0]) + + def test_first_raises(self): + # GH#20725 + ser = Series("a b c".split()) + msg = "'first' only supports a DatetimeIndex index" + with pytest.raises(TypeError, match=msg): + ser.first("1D") + + def test_last_subset(self): + rng = date_range("1/1/2000", "1/1/2010", freq="12h") + ts = Series(np.random.randn(len(rng)), index=rng) + result = ts.last("10d") + assert len(result) == 20 + + rng = date_range("1/1/2000", "1/1/2010", freq="D") + ts = Series(np.random.randn(len(rng)), index=rng) + result = ts.last("10d") + assert len(result) == 10 + + result = ts.last("21D") + expected = ts["12/12/2009":] + tm.assert_series_equal(result, expected) + + result = ts.last("21D") + expected = ts[-21:] + tm.assert_series_equal(result, expected) + + result = ts[:0].last("3M") + tm.assert_series_equal(result, ts[:0]) + + def test_last_raises(self): + # GH#20725 + ser = Series("a b c".split()) + msg = "'last' only supports a DatetimeIndex index" + with pytest.raises(TypeError, match=msg): + ser.last("1D") diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 3d4688c8274f9..caaffb7d5b61f 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -202,9 +202,7 @@ def test_rank_categorical(self): def test_rank_signature(self): s = Series([0, 1]) s.rank(method="average") - msg = ( - "No axis named average for object type <class 'pandas.core.series.Series'>" - ) + msg = "No axis named average for object type Series" with pytest.raises(ValueError, match=msg): s.rank("average") diff --git 
a/pandas/tests/series/methods/test_reindex_like.py b/pandas/tests/series/methods/test_reindex_like.py new file mode 100644 index 0000000000000..7f24c778feb1b --- /dev/null +++ b/pandas/tests/series/methods/test_reindex_like.py @@ -0,0 +1,41 @@ +from datetime import datetime + +import numpy as np + +from pandas import Series +import pandas._testing as tm + + +def test_reindex_like(datetime_series): + other = datetime_series[::2] + tm.assert_series_equal( + datetime_series.reindex(other.index), datetime_series.reindex_like(other) + ) + + # GH#7179 + day1 = datetime(2013, 3, 5) + day2 = datetime(2013, 5, 5) + day3 = datetime(2014, 3, 5) + + series1 = Series([5, None, None], [day1, day2, day3]) + series2 = Series([None, None], [day1, day3]) + + result = series1.reindex_like(series2, method="pad") + expected = Series([5, np.nan], index=[day1, day3]) + tm.assert_series_equal(result, expected) + + +def test_reindex_like_nearest(): + ser = Series(np.arange(10, dtype="int64")) + + target = [0.1, 0.9, 1.5, 2.0] + other = ser.reindex(target, method="nearest") + expected = Series(np.around(target).astype("int64"), target) + + result = ser.reindex_like(other, method="nearest") + tm.assert_series_equal(expected, result) + + result = ser.reindex_like(other, method="nearest", tolerance=1) + tm.assert_series_equal(expected, result) + result = ser.reindex_like(other, method="nearest", tolerance=[1, 2, 3, 4]) + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 60182f509e657..ac07fed7c951a 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -89,3 +89,12 @@ class MyIndexer: s = Series([1, 2, 3]) s.rename(ix, inplace=True) assert s.name is ix + + def test_rename_callable(self): + # GH 17407 + s = Series(range(1, 6), index=Index(range(2, 7), name="IntIndex")) + result = s.rename(str) + expected = s.rename(lambda i: str(i)) + tm.assert_series_equal(result, expected) + + assert result.name == expected.name diff --git a/pandas/tests/series/methods/test_rename_axis.py b/pandas/tests/series/methods/test_rename_axis.py new file mode 100644 index 0000000000000..b519dd1144493 --- /dev/null +++ b/pandas/tests/series/methods/test_rename_axis.py @@ -0,0 +1,43 @@ +import pytest + +from pandas import Index, MultiIndex, Series +import pandas._testing as tm + + +class TestSeriesRenameAxis: + def test_rename_axis_mapper(self): + # GH 19978 + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) + ser = Series(list(range(len(mi))), index=mi) + + result = ser.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] + + result = ser.rename_axis(index=str.upper, axis=0) + assert result.index.names == ["LL", "NN"] + + result = ser.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] + + with pytest.raises(TypeError, match="unexpected"): + ser.rename_axis(columns="wrong") + + def test_rename_axis_inplace(self, datetime_series): + # GH 15704 + expected = datetime_series.rename_axis("foo") + result = datetime_series + no_return = result.rename_axis("foo", inplace=True) + + assert no_return is None + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("kwargs", [{"mapper": None}, {"index": None}, {}]) + def test_rename_axis_none(self, kwargs): + # GH 25034 + index = Index(list("abc"), name="foo") + ser = Series([1, 2, 3], index=index) + + result = ser.rename_axis(**kwargs) + expected_index = 
index.rename(None) if kwargs else index + expected = Series([1, 2, 3], index=expected_index) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 770ad38b0215e..1c54e2b988219 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -108,6 +108,16 @@ def test_replace_gh5319(self): expected = pd.Series([pd.Timestamp.min, ts], dtype=object) tm.assert_series_equal(expected, result) + def test_replace_timedelta_td64(self): + tdi = pd.timedelta_range(0, periods=5) + ser = pd.Series(tdi) + + # Using a single dict argument means we go through replace_list + result = ser.replace({ser[1]: ser[3]}) + + expected = pd.Series([ser[0], ser[3], ser[2], ser[3], ser[4]]) + tm.assert_series_equal(result, expected) + def test_replace_with_single_list(self): ser = pd.Series([0, 1, 2, 3, 4]) result = ser.replace([1, 2, 3]) @@ -241,6 +251,13 @@ def test_replace2(self): assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() + def test_replace_with_dictlike_and_string_dtype(self): + # GH 32621 + s = pd.Series(["one", "two", np.nan], dtype="string") + expected = pd.Series(["1", "2", np.nan]) + result = s.replace({"one": "1", "two": "2"}) + tm.assert_series_equal(expected, result) + def test_replace_with_empty_dictlike(self): # GH 15289 s = pd.Series(list("abcd")) @@ -294,7 +311,7 @@ def test_replace_categorical(self, categorical, numeric): s = pd.Series(categorical) result = s.replace({"A": 1, "B": 2}) expected = pd.Series(numeric) - tm.assert_series_equal(expected, result, check_dtype=False) + tm.assert_series_equal(expected, result) def test_replace_categorical_single(self): # GH 26988 @@ -362,3 +379,30 @@ def test_replace_no_cast(self, ser, exp): expected = pd.Series(exp) tm.assert_series_equal(result, expected) + + def test_replace_invalid_to_replace(self): + # GH 18634 + # API: replace() should raise an exception if invalid argument is given + series = pd.Series(["a", "b", "c "]) + msg = ( + r"Expecting 'to_replace' to be either a scalar, array-like, " + r"dict or None, got invalid type.*" + ) + with pytest.raises(TypeError, match=msg): + series.replace(lambda x: x.strip()) + + def test_replace_only_one_dictlike_arg(self): + # GH#33340 + + ser = pd.Series([1, 2, "A", pd.Timestamp.now(), True]) + to_replace = {0: 1, 2: "A"} + value = "foo" + msg = "Series.replace cannot use dict-like to_replace and non-None value" + with pytest.raises(ValueError, match=msg): + ser.replace(to_replace, value) + + to_replace = 1 + value = {0: "foo", 2: "bar"} + msg = "Series.replace cannot use dict-value and non-None to_replace" + with pytest.raises(ValueError, match=msg): + ser.replace(to_replace, value) diff --git a/pandas/tests/series/methods/test_searchsorted.py b/pandas/tests/series/methods/test_searchsorted.py index fd6c6f74a9136..5a6ec0039c7cd 100644 --- a/pandas/tests/series/methods/test_searchsorted.py +++ b/pandas/tests/series/methods/test_searchsorted.py @@ -40,6 +40,14 @@ def test_searchsorted_datetime64_scalar(self): assert is_scalar(res) assert res == 1 + def test_searchsorted_datetime64_scalar_mixed_timezones(self): + # GH 30086 + ser = Series(date_range("20120101", periods=10, freq="2D", tz="UTC")) + val = Timestamp("20120102", tz="America/New_York") + res = ser.searchsorted(val) + assert is_scalar(res) + assert res == 1 + def test_searchsorted_datetime64_list(self): ser = Series(date_range("20120101", periods=10, freq="2D")) vals = [Timestamp("20120102"), 
Timestamp("20120104")] diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index 8256e2f33b936..e8d7f5958d0a1 100644 --- a/pandas/tests/series/methods/test_shift.py +++ b/pandas/tests/series/methods/test_shift.py @@ -263,3 +263,13 @@ def test_shift_categorical(self): tm.assert_index_equal(s.values.categories, sp1.values.categories) tm.assert_index_equal(s.values.categories, sn2.values.categories) + + def test_shift_dt64values_int_fill_deprecated(self): + # GH#31971 + ser = pd.Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) + + with tm.assert_produces_warning(FutureWarning): + result = ser.shift(1, fill_value=0) + + expected = pd.Series([pd.Timestamp(0), ser[0]]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index 6fa4eeaee34c0..2d4fdfd5a3950 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -8,6 +8,10 @@ class TestSeriesSortIndex: + def test_sort_index_name(self, datetime_series): + result = datetime_series.sort_index(ascending=False) + assert result.name == datetime_series.name + def test_sort_index(self, datetime_series): rindex = list(datetime_series.index) random.shuffle(rindex) @@ -30,7 +34,7 @@ def test_sort_index(self, datetime_series): sorted_series = random_order.sort_index(axis=0) tm.assert_series_equal(sorted_series, datetime_series) - msg = "No axis named 1 for object type " + msg = "No axis named 1 for object type Series" with pytest.raises(ValueError, match=msg): random_order.sort_values(axis=1) diff --git a/pandas/tests/series/methods/test_to_period.py b/pandas/tests/series/methods/test_to_period.py new file mode 100644 index 0000000000000..28c4aad3edf32 --- /dev/null +++ b/pandas/tests/series/methods/test_to_period.py @@ -0,0 +1,47 @@ +import numpy as np + +from pandas import ( + DataFrame, + DatetimeIndex, + PeriodIndex, + Series, + date_range, + period_range, +) +import pandas._testing as tm + + +class TestToPeriod: + def test_to_period(self): + rng = date_range("1/1/2000", "1/1/2001", freq="D") + ts = Series(np.random.randn(len(rng)), index=rng) + + pts = ts.to_period() + exp = ts.copy() + exp.index = period_range("1/1/2000", "1/1/2001") + tm.assert_series_equal(pts, exp) + + pts = ts.to_period("M") + exp.index = exp.index.asfreq("M") + tm.assert_index_equal(pts.index, exp.index.asfreq("M")) + tm.assert_series_equal(pts, exp) + + # GH#7606 without freq + idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"]) + exp_idx = PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D" + ) + + s = Series(np.random.randn(4), index=idx) + expected = s.copy() + expected.index = exp_idx + tm.assert_series_equal(s.to_period(), expected) + + df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx) + expected = df.copy() + expected.index = exp_idx + tm.assert_frame_equal(df.to_period(), expected) + + expected = df.copy() + expected.columns = exp_idx + tm.assert_frame_equal(df.to_period(axis=1), expected) diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index 7645fb8759a54..cdf6a16e88ad0 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -118,3 +118,20 @@ def test_unstack_mixed_type_name_in_multiindex( expected_values, columns=expected_columns, index=expected_index, ) 
tm.assert_frame_equal(result, expected) + + +def test_unstack_multi_index_categorical_values(): + + mi = tm.makeTimeDataFrame().stack().index.rename(["major", "minor"]) + ser = pd.Series(["foo"] * len(mi), index=mi, name="category", dtype="category") + + result = ser.unstack() + + dti = ser.index.levels[0] + c = pd.Categorical(["foo"] * len(dti)) + expected = DataFrame( + {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, + columns=pd.Index(list("ABCD"), name="minor"), + index=dti.rename("major"), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index fdb35befeb0c2..f97362ce9c2a9 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas as pd from pandas import Categorical, CategoricalIndex, Series @@ -177,3 +178,28 @@ def test_value_counts_categorical_with_nan(self): exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) res = ser.value_counts(dropna=False, sort=False) tm.assert_series_equal(res, exp) + + @pytest.mark.parametrize( + "ser, dropna, exp", + [ + ( + pd.Series([False, True, True, pd.NA]), + False, + pd.Series([2, 1, 1], index=[True, False, pd.NA]), + ), + ( + pd.Series([False, True, True, pd.NA]), + True, + pd.Series([2, 1], index=[True, False]), + ), + ( + pd.Series(range(3), index=[True, False, np.nan]).index, + False, + pd.Series([1, 1, 1], index=[True, False, pd.NA]), + ), + ], + ) + def test_value_counts_bool_with_nan(self, ser, dropna, exp): + # GH32146 + out = ser.value_counts(dropna=dropna) + tm.assert_series_equal(out, exp) diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 9be8744d7223f..203750757e28d 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import Index, MultiIndex, Series +from pandas import Index, Series import pandas._testing as tm @@ -53,115 +53,3 @@ def test_set_index_makes_timeseries(self): s = Series(range(10)) s.index = idx assert s.index.is_all_dates - - def test_reorder_levels(self): - index = MultiIndex( - levels=[["bar"], ["one", "two", "three"], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], - names=["L0", "L1", "L2"], - ) - s = Series(np.arange(6), index=index) - - # no change, position - result = s.reorder_levels([0, 1, 2]) - tm.assert_series_equal(s, result) - - # no change, labels - result = s.reorder_levels(["L0", "L1", "L2"]) - tm.assert_series_equal(s, result) - - # rotate, position - result = s.reorder_levels([1, 2, 0]) - e_idx = MultiIndex( - levels=[["one", "two", "three"], [0, 1], ["bar"]], - codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], - names=["L1", "L2", "L0"], - ) - expected = Series(np.arange(6), index=e_idx) - tm.assert_series_equal(result, expected) - - def test_rename_axis_mapper(self): - # GH 19978 - mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) - s = Series(list(range(len(mi))), index=mi) - - result = s.rename_axis(index={"ll": "foo"}) - assert result.index.names == ["foo", "nn"] - - result = s.rename_axis(index=str.upper, axis=0) - assert result.index.names == ["LL", "NN"] - - result = s.rename_axis(index=["foo", "goo"]) - assert result.index.names == ["foo", "goo"] - - with pytest.raises(TypeError, match="unexpected"): - 
s.rename_axis(columns="wrong") - - def test_rename_axis_inplace(self, datetime_series): - # GH 15704 - expected = datetime_series.rename_axis("foo") - result = datetime_series - no_return = result.rename_axis("foo", inplace=True) - - assert no_return is None - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("kwargs", [{"mapper": None}, {"index": None}, {}]) - def test_rename_axis_none(self, kwargs): - # GH 25034 - index = Index(list("abc"), name="foo") - df = Series([1, 2, 3], index=index) - - result = df.rename_axis(**kwargs) - expected_index = index.rename(None) if kwargs else index - expected = Series([1, 2, 3], index=expected_index) - tm.assert_series_equal(result, expected) - - def test_set_axis_inplace_axes(self, axis_series): - # GH14636 - ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") - - expected = ser.copy() - expected.index = list("abcd") - - # inplace=True - # The FutureWarning comes from the fact that we would like to have - # inplace default to False some day - result = ser.copy() - result.set_axis(list("abcd"), axis=axis_series, inplace=True) - tm.assert_series_equal(result, expected) - - def test_set_axis_inplace(self): - # GH14636 - - s = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") - - expected = s.copy() - expected.index = list("abcd") - - # inplace=False - result = s.set_axis(list("abcd"), axis=0, inplace=False) - tm.assert_series_equal(expected, result) - - # omitting the "axis" parameter - with tm.assert_produces_warning(None): - result = s.set_axis(list("abcd"), inplace=False) - tm.assert_series_equal(result, expected) - - # wrong values for the "axis" parameter - for axis in [2, "foo"]: - with pytest.raises(ValueError, match="No axis named"): - s.set_axis(list("abcd"), axis=axis, inplace=False) - - def test_droplevel(self): - # GH20342 - ser = Series([1, 2, 3, 4]) - ser.index = MultiIndex.from_arrays( - [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"] - ) - expected = ser.reset_index("b", drop=True) - result = ser.droplevel("b", axis="index") - tm.assert_series_equal(result, expected) - # test that droplevel raises ValueError on axis != 0 - with pytest.raises(ValueError): - ser.droplevel(1, axis="columns") diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6f45b72154805..ab8618eb0a7d4 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -17,38 +17,6 @@ def test_prod_numpy16_bug(self): assert not isinstance(result, Series) - def test_dot(self): - a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) - b = DataFrame( - np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] - ).T - - result = a.dot(b) - expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) - tm.assert_series_equal(result, expected) - - # Check index alignment - b2 = b.reindex(index=reversed(b.index)) - result = a.dot(b) - tm.assert_series_equal(result, expected) - - # Check ndarray argument - result = a.dot(b.values) - assert np.all(result == expected.values) - tm.assert_almost_equal(a.dot(b["2"].values), expected["2"]) - - # Check series argument - tm.assert_almost_equal(a.dot(b["1"]), expected["1"]) - tm.assert_almost_equal(a.dot(b2["1"]), expected["1"]) - - msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" - # exception raised is of type Exception - with pytest.raises(Exception, match=msg): - a.dot(a.values[:3]) - msg = "matrices are not aligned" - with pytest.raises(ValueError, match=msg): - a.dot(b.T) - def 
test_matmul(self): # matmul test is for GH #10259 a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) @@ -209,3 +177,27 @@ def test_validate_stat_keepdims(self): ) with pytest.raises(ValueError, match=msg): np.sum(s, keepdims=True) + + def test_td64_summation_overflow(self): + # GH 9442 + s = pd.Series(pd.date_range("20130101", periods=100000, freq="H")) + s[0] += pd.Timedelta("1s 1ms") + + # mean + result = (s - s.min()).mean() + expected = pd.Timedelta((pd.TimedeltaIndex((s - s.min())).asi8 / len(s)).sum()) + + # the computation is converted to float so + # might be some loss of precision + assert np.allclose(result.value / 1000, expected.value / 1000) + + # sum + msg = "overflow in timedelta operation" + with pytest.raises(ValueError, match=msg): + (s - s.min()).sum() + + s1 = s[0:10000] + with pytest.raises(ValueError, match=msg): + (s1 - s1.min()).sum() + s2 = s[0:1000] + (s2 - s2.min()).sum() diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 3e877cf2fc787..302ca8d1aa43e 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -110,10 +110,6 @@ def _pickle_roundtrip(self, obj): unpickled = pd.read_pickle(path) return unpickled - def test_sort_index_name(self, datetime_series): - result = datetime_series.sort_index(ascending=False) - assert result.name == datetime_series.name - def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} result = Series(d) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index a4c55a80a9f0f..0661828814888 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -4,10 +4,11 @@ import numpy as np import pytest +from pandas.core.dtypes.generic import ABCMultiIndex + import pandas as pd from pandas import DataFrame, Index, Series, isna import pandas._testing as tm -from pandas.conftest import _get_cython_table_params from pandas.core.base import SpecificationError @@ -354,7 +355,7 @@ def test_non_callable_aggregates(self): @pytest.mark.parametrize( "series, func, expected", chain( - _get_cython_table_params( + tm.get_cython_table_params( Series(dtype=np.float64), [ ("sum", 0), @@ -369,7 +370,7 @@ def test_non_callable_aggregates(self): ("median", np.nan), ], ), - _get_cython_table_params( + tm.get_cython_table_params( Series([np.nan, 1, 2, 3]), [ ("sum", 6), @@ -384,7 +385,7 @@ def test_non_callable_aggregates(self): ("median", 2), ], ), - _get_cython_table_params( + tm.get_cython_table_params( Series("a b c".split()), [ ("sum", "abc"), @@ -409,21 +410,21 @@ def test_agg_cython_table(self, series, func, expected): @pytest.mark.parametrize( "series, func, expected", chain( - _get_cython_table_params( + tm.get_cython_table_params( Series(dtype=np.float64), [ ("cumprod", Series([], Index([]), dtype=np.float64)), ("cumsum", Series([], Index([]), dtype=np.float64)), ], ), - _get_cython_table_params( + tm.get_cython_table_params( Series([np.nan, 1, 2, 3]), [ ("cumprod", Series([np.nan, 1, 2, 6])), ("cumsum", Series([np.nan, 1, 3, 6])), ], ), - _get_cython_table_params( + tm.get_cython_table_params( Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] ), ), @@ -438,7 +439,7 @@ def test_agg_cython_table_transform(self, series, func, expected): @pytest.mark.parametrize( "series, func, expected", chain( - _get_cython_table_params( + tm.get_cython_table_params( Series("a b c".split()), [ ("mean", TypeError), # mean raises TypeError @@ -514,9 +515,11 @@ def test_map(self, datetime_series): exp = Series([np.nan, 
"B", "C", "D"]) tm.assert_series_equal(a.map(c), exp) - @pytest.mark.parametrize("index", tm.all_index_generator(10)) - def test_map_empty(self, index): - s = Series(index) + def test_map_empty(self, indices): + if isinstance(indices, ABCMultiIndex): + pytest.skip("Initializing a Series from a MultiIndex is not supported") + + s = Series(indices) result = s.map({}) expected = pd.Series(np.nan, index=s.index) @@ -627,19 +630,19 @@ class DictWithoutMissing(dict): expected = Series([np.nan, np.nan, "three"]) tm.assert_series_equal(result, expected) - def test_map_abc_mapping(self, non_mapping_dict_subclass): + def test_map_abc_mapping(self, non_dict_mapping_subclass): # https://github.com/pandas-dev/pandas/issues/29733 # Check collections.abc.Mapping support as mapper for Series.map s = Series([1, 2, 3]) - not_a_dictionary = non_mapping_dict_subclass({3: "three"}) + not_a_dictionary = non_dict_mapping_subclass({3: "three"}) result = s.map(not_a_dictionary) expected = Series([np.nan, np.nan, "three"]) tm.assert_series_equal(result, expected) - def test_map_abc_mapping_with_missing(self, non_mapping_dict_subclass): + def test_map_abc_mapping_with_missing(self, non_dict_mapping_subclass): # https://github.com/pandas-dev/pandas/issues/29733 # Check collections.abc.Mapping support as mapper for Series.map - class NonDictMappingWithMissing(non_mapping_dict_subclass): + class NonDictMappingWithMissing(non_dict_mapping_subclass): def __missing__(self, key): return "missing" @@ -787,3 +790,25 @@ def test_map_float_to_string_precision(self): result = ser.map(lambda val: str(val)).to_dict() expected = {0: "0.3333333333333333"} assert result == expected + + def test_map_with_invalid_na_action_raises(self): + # https://github.com/pandas-dev/pandas/issues/32815 + s = pd.Series([1, 2, 3]) + msg = "na_action must either be 'ignore' or None" + with pytest.raises(ValueError, match=msg): + s.map(lambda x: x, na_action="____") + + def test_apply_to_timedelta(self): + list_of_valid_strings = ["00:00:01", "00:00:02"] + a = pd.to_timedelta(list_of_valid_strings) + b = Series(list_of_valid_strings).apply(pd.to_timedelta) + # FIXME: dont leave commented-out + # Can't compare until apply on a Series gives the correct dtype + # assert_series_equal(a, b) + + list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT] + + a = pd.to_timedelta(list_of_strings) # noqa + b = Series(list_of_strings).apply(pd.to_timedelta) # noqa + # Can't compare until apply on a Series gives the correct dtype + # assert_series_equal(a, b) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index f3ffdc373e178..a6385240537ca 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -1,13 +1,16 @@ +from datetime import timedelta import operator import numpy as np import pytest +import pytz from pandas._libs.tslibs import IncompatibleFrequency import pandas as pd -from pandas import Series +from pandas import Categorical, Index, Series, bdate_range, date_range, isna import pandas._testing as tm +from pandas.core import nanops, ops def _permute(obj): @@ -63,6 +66,65 @@ def _constructor(self): result = op(m, 1) assert result.x == 42 + def test_flex_add_scalar_fill_value(self): + # GH12723 + s = Series([0, 1, np.nan, 3, 4, 5]) + + exp = s.fillna(0).add(2) + res = s.add(2, fill_value=0) + tm.assert_series_equal(res, exp) + + pairings = [(Series.div, operator.truediv, 1), (Series.rdiv, ops.rtruediv, 1)] + for op in 
["add", "sub", "mul", "pow", "truediv", "floordiv"]: + fv = 0 + lop = getattr(Series, op) + lequiv = getattr(operator, op) + rop = getattr(Series, "r" + op) + # bind op at definition time... + requiv = lambda x, y, op=op: getattr(operator, op)(y, x) + pairings.append((lop, lequiv, fv)) + pairings.append((rop, requiv, fv)) + + @pytest.mark.parametrize("op, equiv_op, fv", pairings) + def test_operators_combine(self, op, equiv_op, fv): + def _check_fill(meth, op, a, b, fill_value=0): + exp_index = a.index.union(b.index) + a = a.reindex(exp_index) + b = b.reindex(exp_index) + + amask = isna(a) + bmask = isna(b) + + exp_values = [] + for i in range(len(exp_index)): + with np.errstate(all="ignore"): + if amask[i]: + if bmask[i]: + exp_values.append(np.nan) + continue + exp_values.append(op(fill_value, b[i])) + elif bmask[i]: + if amask[i]: + exp_values.append(np.nan) + continue + exp_values.append(op(a[i], fill_value)) + else: + exp_values.append(op(a[i], b[i])) + + result = meth(a, b, fill_value=fill_value) + expected = Series(exp_values, exp_index) + tm.assert_series_equal(result, expected) + + a = Series([np.nan, 1.0, 2.0, 3.0, np.nan], index=np.arange(5)) + b = Series([np.nan, 1, np.nan, 3, np.nan, 4.0], index=np.arange(6)) + + result = op(a, b) + exp = equiv_op(a, b) + tm.assert_series_equal(result, exp) + _check_fill(op, equiv_op, a, b, fill_value=fv) + # should accept axis=0 or axis='rows' + op(a, b, axis=0) + class TestSeriesArithmetic: # Some of these may end up in tests/arithmetic, but are not yet sorted @@ -73,7 +135,7 @@ def test_add_series_with_period_index(self): result = ts + ts[::2] expected = ts + ts - expected[1::2] = np.nan + expected.iloc[1::2] = np.nan tm.assert_series_equal(result, expected) result = ts + _permute(ts[::2]) @@ -98,6 +160,100 @@ def test_string_addition(self, target_add, input_value, expected_value): expected = Series(expected_value) tm.assert_series_equal(result, expected) + def test_divmod(self): + # GH#25557 + a = Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + b = Series([2, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) + + result = a.divmod(b) + expected = divmod(a, b) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + result = a.rdivmod(b) + expected = divmod(b, a) + tm.assert_series_equal(result[0], expected[0]) + tm.assert_series_equal(result[1], expected[1]) + + @pytest.mark.parametrize("index", [None, range(9)]) + def test_series_integer_mod(self, index): + # GH#24396 + s1 = Series(range(1, 10)) + s2 = Series("foo", index=index) + + msg = "not all arguments converted during string formatting" + + with pytest.raises(TypeError, match=msg): + s2 % s1 + + def test_add_with_duplicate_index(self): + # GH14227 + s1 = Series([1, 2], index=[1, 1]) + s2 = Series([10, 10], index=[1, 2]) + result = s1 + s2 + expected = pd.Series([11, 12, np.nan], index=[1, 1, 2]) + tm.assert_series_equal(result, expected) + + def test_add_na_handling(self): + from decimal import Decimal + from datetime import date + + s = Series( + [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] + ) + + result = s + s.shift(1) + result2 = s.shift(1) + s + assert isna(result[0]) + assert isna(result2[0]) + + def test_add_corner_cases(self, datetime_series): + empty = Series([], index=Index([]), dtype=np.float64) + + result = datetime_series + empty + assert np.isnan(result).all() + + result = empty + empty.copy() + assert len(result) == 0 + + # FIXME: dont leave commented-out + # TODO: this returned 
NotImplemented earlier, what to do? + # deltas = Series([timedelta(1)] * 5, index=np.arange(5)) + # sub_deltas = deltas[::2] + # deltas5 = deltas * 5 + # deltas = deltas + sub_deltas + + # float + int + int_ts = datetime_series.astype(int)[:-5] + added = datetime_series + int_ts + expected = Series( + datetime_series.values[:-5] + int_ts.values, + index=datetime_series.index[:-5], + name="ts", + ) + tm.assert_series_equal(added[:-5], expected) + + def test_mul_empty_int_corner_case(self): + s1 = Series([], [], dtype=np.int32) + s2 = Series({"x": 0.0}) + tm.assert_series_equal(s1 * s2, Series([np.nan], index=["x"])) + + def test_sub_datetimelike_align(self): + # GH#7500 + # datetimelike ops need to align + dt = Series(date_range("2012-1-1", periods=3, freq="D")) + dt.iloc[2] = np.nan + dt2 = dt[::-1] + + expected = Series([timedelta(0), timedelta(0), pd.NaT]) + # name is reset + result = dt2 - dt + tm.assert_series_equal(result, expected) + + expected = Series(expected, name=0) + result = (dt2.to_frame() - dt.to_frame())[0] + tm.assert_series_equal(result, expected) + # ------------------------------------------------------------------ # Comparisons @@ -130,6 +286,50 @@ def test_comparison_flex_basic(self): with pytest.raises(ValueError, match=msg): getattr(left, op)(right, axis=1) + def test_comparison_flex_alignment(self): + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) + + exp = pd.Series([False, False, True, False], index=list("abcd")) + tm.assert_series_equal(left.eq(right), exp) + + exp = pd.Series([True, True, False, True], index=list("abcd")) + tm.assert_series_equal(left.ne(right), exp) + + exp = pd.Series([False, False, True, False], index=list("abcd")) + tm.assert_series_equal(left.le(right), exp) + + exp = pd.Series([False, False, False, False], index=list("abcd")) + tm.assert_series_equal(left.lt(right), exp) + + exp = pd.Series([False, True, True, False], index=list("abcd")) + tm.assert_series_equal(left.ge(right), exp) + + exp = pd.Series([False, True, False, False], index=list("abcd")) + tm.assert_series_equal(left.gt(right), exp) + + def test_comparison_flex_alignment_fill(self): + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) + + exp = pd.Series([False, False, True, True], index=list("abcd")) + tm.assert_series_equal(left.eq(right, fill_value=2), exp) + + exp = pd.Series([True, True, False, False], index=list("abcd")) + tm.assert_series_equal(left.ne(right, fill_value=2), exp) + + exp = pd.Series([False, False, True, True], index=list("abcd")) + tm.assert_series_equal(left.le(right, fill_value=0), exp) + + exp = pd.Series([False, False, False, True], index=list("abcd")) + tm.assert_series_equal(left.lt(right, fill_value=0), exp) + + exp = pd.Series([True, True, True, False], index=list("abcd")) + tm.assert_series_equal(left.ge(right, fill_value=0), exp) + + exp = pd.Series([True, True, False, False], index=list("abcd")) + tm.assert_series_equal(left.gt(right, fill_value=0), exp) + class TestSeriesComparison: def test_comparison_different_length(self): @@ -203,3 +403,281 @@ def test_ser_cmp_result_names(self, names, op): ser = Series(cidx).rename(names[1]) result = op(ser, cidx) assert result.name == names[2] + + def test_comparisons(self): + left = np.random.randn(10) + right = np.random.randn(10) + left[:3] = np.nan + + result = nanops.nangt(left, right) + with np.errstate(invalid="ignore"): + expected = (left > right).astype("O") + expected[:3] = np.nan + + 
tm.assert_almost_equal(result, expected) + + s = Series(["a", "b", "c"]) + s2 = Series([False, True, False]) + + # it works! + exp = Series([False, False, False]) + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) + + # ----------------------------------------------------------------- + # Categorical Dtype Comparisons + + def test_categorical_comparisons(self): + # GH#8938 + # allow equality comparisons + a = Series(list("abc"), dtype="category") + b = Series(list("abc"), dtype="object") + c = Series(["a", "b", "cc"], dtype="object") + d = Series(list("acb"), dtype="object") + e = Categorical(list("abc")) + f = Categorical(list("acb")) + + # vs scalar + assert not (a == "a").all() + assert ((a != "a") == ~(a == "a")).all() + + assert not ("a" == a).all() + assert (a == "a")[0] + assert ("a" == a)[0] + assert not ("a" != a)[0] + + # vs list-like + assert (a == a).all() + assert not (a != a).all() + + assert (a == list(a)).all() + assert (a == b).all() + assert (b == a).all() + assert ((~(a == b)) == (a != b)).all() + assert ((~(b == a)) == (b != a)).all() + + assert not (a == c).all() + assert not (c == a).all() + assert not (a == d).all() + assert not (d == a).all() + + # vs a cat-like + assert (a == e).all() + assert (e == a).all() + assert not (a == f).all() + assert not (f == a).all() + + assert (~(a == e) == (a != e)).all() + assert (~(e == a) == (e != a)).all() + assert (~(a == f) == (a != f)).all() + assert (~(f == a) == (f != a)).all() + + # non-equality is not comparable + with pytest.raises(TypeError): + a < b + with pytest.raises(TypeError): + b < a + with pytest.raises(TypeError): + a > b + with pytest.raises(TypeError): + b > a + + def test_unequal_categorical_comparison_raises_type_error(self): + # unequal comparison should raise for unordered cats + cat = Series(Categorical(list("abc"))) + with pytest.raises(TypeError): + cat > "b" + + cat = Series(Categorical(list("abc"), ordered=False)) + with pytest.raises(TypeError): + cat > "b" + + # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057 + # and following comparisons with scalars not in categories should raise + # for unequal comps, but not for equal/not equal + cat = Series(Categorical(list("abc"), ordered=True)) + + with pytest.raises(TypeError): + cat < "d" + with pytest.raises(TypeError): + cat > "d" + with pytest.raises(TypeError): + "d" < cat + with pytest.raises(TypeError): + "d" > cat + + tm.assert_series_equal(cat == "d", Series([False, False, False])) + tm.assert_series_equal(cat != "d", Series([True, True, True])) + + # ----------------------------------------------------------------- + + def test_comparison_tuples(self): + # GH#11339 + # comparisons vs tuple + s = Series([(1, 1), (1, 2)]) + + result = s == (1, 2) + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + + result = s != (1, 2) + expected = Series([True, False]) + tm.assert_series_equal(result, expected) + + result = s == (0, 0) + expected = Series([False, False]) + tm.assert_series_equal(result, expected) + + result = s != (0, 0) + expected = Series([True, True]) + tm.assert_series_equal(result, expected) + + s = Series([(1, 1), (1, 1)]) + + result = s == (1, 1) + expected = Series([True, True]) + tm.assert_series_equal(result, expected) + + result = s != (1, 1) + expected = Series([False, False]) + tm.assert_series_equal(result, expected) + + s = Series([frozenset([1]), frozenset([1, 2])]) + + result = s == frozenset([1]) + expected = Series([True, False]) + 
tm.assert_series_equal(result, expected) + + def test_comparison_operators_with_nas(self): + ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) + ser[::2] = np.nan + + # test that comparisons work + ops = ["lt", "le", "gt", "ge", "eq", "ne"] + for op in ops: + val = ser[5] + + f = getattr(operator, op) + result = f(ser, val) + + expected = f(ser.dropna(), val).reindex(ser.index) + + if op == "ne": + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) + + tm.assert_series_equal(result, expected) + + # FIXME: dont leave commented-out + # fffffffuuuuuuuuuuuu + # result = f(val, s) + # expected = f(val, s.dropna()).reindex(s.index) + # tm.assert_series_equal(result, expected) + + def test_ne(self): + ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) + expected = [True, True, False, True, True] + assert tm.equalContents(ts.index != 5, expected) + assert tm.equalContents(~(ts.index == 5), expected) + + def test_comp_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") + + s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") + + for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: + + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + left == right + + with pytest.raises(ValueError, match=msg): + left != right + + with pytest.raises(ValueError, match=msg): + left < right + + msg = "Can only compare identically-labeled DataFrame objects" + with pytest.raises(ValueError, match=msg): + left.to_frame() == right.to_frame() + + with pytest.raises(ValueError, match=msg): + left.to_frame() != right.to_frame() + + with pytest.raises(ValueError, match=msg): + left.to_frame() < right.to_frame() + + def test_compare_series_interval_keyword(self): + # GH#25338 + s = Series(["IntervalA", "IntervalB", "IntervalC"]) + result = s == "IntervalA" + expected = Series([True, False, False]) + tm.assert_series_equal(result, expected) + + +# ------------------------------------------------------------------ +# Unsorted +# These arithmetic tests were previously in other files, eventually +# should be parametrized and put into tests.arithmetic + + +class TestTimeSeriesArithmetic: + # TODO: De-duplicate with test below + def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): + rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") + ser = Series(np.random.randn(len(rng)), index=rng) + + ts_moscow = ser.tz_convert("Europe/Moscow") + + result = ser + ts_moscow + assert result.index.tz is pytz.utc + + result = ts_moscow + ser + assert result.index.tz is pytz.utc + + def test_series_add_tz_mismatch_converts_to_utc(self): + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + + perm = np.random.permutation(100)[:90] + ser1 = Series( + np.random.randn(90), index=rng.take(perm).tz_convert("US/Eastern") + ) + + perm = np.random.permutation(100)[:90] + ser2 = Series( + np.random.randn(90), index=rng.take(perm).tz_convert("Europe/Berlin") + ) + + result = ser1 + ser2 + + uts1 = ser1.tz_convert("utc") + uts2 = ser2.tz_convert("utc") + expected = uts1 + uts2 + + assert result.index.tz == pytz.UTC + tm.assert_series_equal(result, expected) + + def test_series_add_aware_naive_raises(self): + rng = date_range("1/1/2011", periods=10, freq="H") + ser = Series(np.random.randn(len(rng)), index=rng) + + ser_utc = 
ser.tz_localize("utc") + + with pytest.raises(Exception): + ser + ser_utc + + with pytest.raises(Exception): + ser_utc + ser + + def test_datetime_understood(self): + # Ensures it doesn't fail to create the right series + # reported in issue#16726 + series = pd.Series(pd.date_range("2012-01-01", periods=3)) + offset = pd.offsets.DateOffset(days=6) + result = series - offset + expected = pd.Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_block_internals.py b/pandas/tests/series/test_block_internals.py index 18e75c3be5bcc..d0dfbe6f5b569 100644 --- a/pandas/tests/series/test_block_internals.py +++ b/pandas/tests/series/test_block_internals.py @@ -31,8 +31,8 @@ def test_dt64tz_setitem_does_not_mutate_dti(self): ser = pd.Series(dti) assert ser._values is not dti assert ser._values._data.base is not dti._data._data.base - assert ser._data.blocks[0].values is not dti - assert ser._data.blocks[0].values._data.base is not dti._data._data.base + assert ser._mgr.blocks[0].values is not dti + assert ser._mgr.blocks[0].values._data.base is not dti._data._data.base ser[::3] = pd.NaT assert ser[0] is pd.NaT diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index adb79f69c2d81..0766bfc37d7ca 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -5,7 +5,7 @@ from pandas import Series -class TestSeriesCombine: +class TestSeriesConcat: @pytest.mark.parametrize( "dtype", ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"] ) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index b0d06793dbe13..effb324298c95 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -393,7 +393,7 @@ def test_constructor_categorical_dtype(self): expected = Series( ["a", "a"], index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True) ) - tm.assert_series_equal(result, expected, check_categorical=True) + tm.assert_series_equal(result, expected) def test_constructor_categorical_string(self): # GH 26336: the string 'category' maintains existing CategoricalDtype @@ -626,7 +626,7 @@ def test_constructor_limit_copies(self, index): s = pd.Series(index) # we make 1 copy; this is just a smoke test here - assert s._data.blocks[0].values is not index + assert s._mgr.blocks[0].values is not index def test_constructor_pass_none(self): with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): @@ -1115,7 +1115,7 @@ def create_data(constructor): tm.assert_series_equal(result_datetime, expected) tm.assert_series_equal(result_Timestamp, expected) - def test_contructor_dict_tuple_indexer(self): + def test_constructor_dict_tuple_indexer(self): # GH 12948 data = {(1, 1, None): -1.0} result = Series(data) @@ -1124,9 +1124,9 @@ def test_contructor_dict_tuple_indexer(self): ) tm.assert_series_equal(result, expected) - def test_constructor_mapping(self, non_mapping_dict_subclass): + def test_constructor_mapping(self, non_dict_mapping_subclass): # GH 29788 - ndm = non_mapping_dict_subclass({3: "three"}) + ndm = non_dict_mapping_subclass({3: "three"}) result = Series(ndm) expected = Series(["three"], index=[3]) @@ -1329,6 +1329,7 @@ def test_convert_non_ns(self): ) tm.assert_series_equal(s, Series(date_range("20130101", periods=3, freq="D"))) + # FIXME: dont leave commented-out # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01 # 
00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')) @@ -1421,3 +1422,17 @@ def test_constructor_tz_mixed_data(self): result = Series(dt_list) expected = Series(dt_list, dtype=object) tm.assert_series_equal(result, expected) + + def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): + # GH#25843 + tz = tz_aware_fixture + result = Series([Timestamp("2019", tz=tz)], dtype="datetime64[ns]") + expected = Series([Timestamp("2019")]) + tm.assert_series_equal(result, expected) + + def test_constructor_datetime64(self): + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") + dates = np.asarray(rng) + + series = Series(dates) + assert np.issubdtype(series.dtype, np.dtype("M8[ns]")) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 59ae0cd63690c..515e75b82371a 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -65,7 +65,7 @@ def get_expected(s, name): if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype("int64") - elif not is_list_like(result): + elif not is_list_like(result) or isinstance(result, pd.DataFrame): return result return Series(result, index=s.index, name=s.name) @@ -74,6 +74,8 @@ def compare(s, name): b = get_expected(s, prop) if not (is_list_like(a) and is_list_like(b)): assert a == b + elif isinstance(a, pd.DataFrame): + tm.assert_frame_equal(a, b) else: tm.assert_series_equal(a, b) @@ -632,15 +634,6 @@ def test_date_tz(self): tm.assert_series_equal(s.dt.date, expected) tm.assert_series_equal(s.apply(lambda x: x.date()), expected) - def test_datetime_understood(self): - # Ensures it doesn't fail to create the right series - # reported in issue#16726 - series = pd.Series(pd.date_range("2012-01-01", periods=3)) - offset = pd.offsets.DateOffset(days=6) - result = series - offset - expected = pd.Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) - tm.assert_series_equal(result, expected) - def test_dt_timetz_accessor(self, tz_naive_fixture): # GH21358 tz = maybe_get_tz(tz_naive_fixture) @@ -674,3 +667,19 @@ def test_setitem_with_different_tz(self): dtype=object, ) tm.assert_series_equal(ser, expected) + + @pytest.mark.parametrize( + "input_series, expected_output", + [ + [["2020-01-01"], [[2020, 1, 3]]], + [[pd.NaT], [[np.NaN, np.NaN, np.NaN]]], + [["2019-12-31", "2019-12-29"], [[2020, 1, 2], [2019, 52, 7]]], + [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.NaN, np.NaN, np.NaN]]], + ], + ) + def test_isocalendar(self, input_series, expected_output): + result = pd.to_datetime(pd.Series(input_series)).dt.isocalendar + expected_frame = pd.DataFrame( + expected_output, columns=["year", "week", "day"], dtype="UInt32" + ) + tm.assert_frame_equal(result, expected_frame) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 80a024eda7848..05e708e575a64 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -132,7 +132,7 @@ def test_astype_str_map(self, dtype, series): expected = series.map(str) tm.assert_series_equal(result, expected) - def test_astype_str_cast(self): + def test_astype_str_cast_dt64(self): # see gh-9757 ts = Series([Timestamp("2010-01-04 00:00:00")]) s = ts.astype(str) @@ -146,11 +146,14 @@ def test_astype_str_cast(self): expected = Series([str("2010-01-04 00:00:00-05:00")]) tm.assert_series_equal(s, expected) + def test_astype_str_cast_td64(self): + # see gh-9757 + td = Series([Timedelta(1, unit="d")]) - s = 
td.astype(str) + ser = td.astype(str) - expected = Series([str("1 days 00:00:00.000000000")]) - tm.assert_series_equal(s, expected) + expected = Series([str("1 days")]) + tm.assert_series_equal(ser, expected) def test_astype_unicode(self): # see gh-7758: A bit of magic is required to set @@ -296,24 +299,25 @@ def cmp(a, b): # array conversion tm.assert_almost_equal(np.array(s), np.array(s.values)) - # valid conversion - for valid in [ - lambda x: x.astype("category"), - lambda x: x.astype(CategoricalDtype()), - lambda x: x.astype("object").astype("category"), - lambda x: x.astype("object").astype(CategoricalDtype()), - ]: + tm.assert_series_equal(s.astype("category"), s) + tm.assert_series_equal(s.astype(CategoricalDtype()), s) - result = valid(s) - # compare series values - # internal .categories can't be compared because it is sorted - tm.assert_series_equal(result, s, check_categorical=False) + roundtrip_expected = s.cat.set_categories( + s.cat.categories.sort_values() + ).cat.remove_unused_categories() + tm.assert_series_equal( + s.astype("object").astype("category"), roundtrip_expected + ) + tm.assert_series_equal( + s.astype("object").astype(CategoricalDtype()), roundtrip_expected + ) # invalid conversion (these are NOT a dtype) msg = ( - r"invalid type for astype" + "dtype '' " + "not understood" ) + for invalid in [ lambda x: x.astype(Categorical), lambda x: x.astype("object").astype(Categorical), diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 3513db6177951..89181a08819b1 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -47,9 +47,9 @@ def test_unique(): # GH 18051 s = Series(Categorical([])) - tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False) + tm.assert_categorical_equal(s.unique(), Categorical([])) s = Series(Categorical([np.nan])) - tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), check_dtype=False) + tm.assert_categorical_equal(s.unique(), Categorical([np.nan])) def test_unique_data_ownership(): diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index 1566d8f36373b..51410fce7efae 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -207,7 +207,7 @@ def test_constructor_no_pandas_array(self): ser = pd.Series([1, 2, 3]) result = pd.Series(ser.array) tm.assert_series_equal(ser, result) - assert isinstance(result._data.blocks[0], IntBlock) + assert isinstance(result._mgr.blocks[0], IntBlock) def test_astype_no_pandas_dtype(self): # https://github.com/pandas-dev/pandas/pull/24866 @@ -219,20 +219,20 @@ def test_astype_no_pandas_dtype(self): def test_from_array(self): result = pd.Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) - assert result._data.blocks[0].is_extension is False + assert result._mgr.blocks[0].is_extension is False result = pd.Series(pd.array(["2015"], dtype="datetime64[ns]")) - assert result._data.blocks[0].is_extension is False + assert result._mgr.blocks[0].is_extension is False def test_from_list_dtype(self): result = pd.Series(["1H", "2H"], dtype="timedelta64[ns]") - assert result._data.blocks[0].is_extension is False + assert result._mgr.blocks[0].is_extension is False result = pd.Series(["2015"], dtype="datetime64[ns]") - assert result._data.blocks[0].is_extension is False + assert result._mgr.blocks[0].is_extension is False -def test_hasnans_unchached_for_series(): +def test_hasnans_uncached_for_series(): # 
GH#19700 idx = pd.Index([0, 1]) assert idx.hasnans is False diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index bac005465034f..9e9b93a499487 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -122,22 +122,6 @@ def test_datetime64_fillna(self): ) s[2] = np.nan - # reg fillna - result = s.fillna(Timestamp("20130104")) - expected = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130104"), - Timestamp("20130103 9:01:01"), - ] - ) - tm.assert_series_equal(result, expected) - - result = s.fillna(NaT) - expected = s - tm.assert_series_equal(result, expected) - # ffill result = s.ffill() expected = Series( @@ -177,242 +161,228 @@ def test_datetime64_fillna(self): result = s.fillna(method="backfill") tm.assert_series_equal(result, expected) - def test_datetime64_tz_fillna(self): - - for tz in ["US/Eastern", "Asia/Tokyo"]: - # DatetimeBlock - s = Series( - [ - Timestamp("2011-01-01 10:00"), - pd.NaT, - Timestamp("2011-01-03 10:00"), - pd.NaT, - ] - ) - null_loc = pd.Series([False, True, False, True]) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00")) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-02 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - # check s is not changed - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna("AAA") - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - "AAA", - Timestamp("2011-01-03 10:00"), - "AAA", - ], - dtype=object, - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00"), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - # DatetimeBlockTZ - idx = pd.DatetimeIndex( - ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz - ) - s = pd.Series(idx) - assert s.dtype == f"datetime64[ns, {tz}]" - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-02 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) - idx = pd.DatetimeIndex( - [ - "2011-01-01 10:00", - "2011-01-02 10:00", - "2011-01-03 10:00", - "2011-01-02 10:00", - ], - tz=tz, - ) - 
expected = Series(idx) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) - idx = pd.DatetimeIndex( - [ - "2011-01-01 10:00", - "2011-01-02 10:00", - "2011-01-03 10:00", - "2011-01-02 10:00", - ], - tz=tz, - ) - expected = Series(idx) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna("AAA") - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - "AAA", - Timestamp("2011-01-03 10:00", tz=tz), - "AAA", - ], - dtype=object, - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00", tz=tz), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-04 10:00", tz=tz), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - # filling with a naive/other zone, coerce to object - result = s.fillna(Timestamp("20130101")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2013-01-01"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2013-01-01"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(Timestamp("20130101", tz="US/Pacific")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2013-01-01", tz="US/Pacific"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2013-01-01", tz="US/Pacific"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) + @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) + def test_datetime64_tz_fillna(self, tz): + # DatetimeBlock + s = Series( + [ + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-03 10:00"), + pd.NaT, + ] + ) + null_loc = pd.Series([False, True, False, True]) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + # check s is not changed + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + "AAA", + Timestamp("2011-01-03 10:00"), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) 
+ expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + {1: pd.Timestamp("2011-01-02 10:00"), 3: pd.Timestamp("2011-01-04 10:00")} + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + # DatetimeBlockTZ + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz + ) + s = pd.Series(idx) + assert s.dtype == f"datetime64[ns, {tz}]" + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + "AAA", + Timestamp("2011-01-03 10:00", tz=tz), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00", tz=tz), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + # filling with a naive/other zone, coerce to object + result = s.fillna(Timestamp("20130101")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(pd.isna(s), null_loc) + + result = s.fillna(Timestamp("20130101", tz="US/Pacific")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + ] + ) + tm.assert_series_equal(expected, result) + 
tm.assert_series_equal(pd.isna(s), null_loc) + + def test_fillna_dt64tz_with_method(self): # with timezone # GH 15855 - df = pd.Series([pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]) + ser = pd.Series([pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]) exp = pd.Series( [ pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.Timestamp("2012-11-11 00:00:00+01:00"), ] ) - tm.assert_series_equal(df.fillna(method="pad"), exp) + tm.assert_series_equal(ser.fillna(method="pad"), exp) - df = pd.Series([pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]) + ser = pd.Series([pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]) exp = pd.Series( [ pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.Timestamp("2012-11-11 00:00:00+01:00"), ] ) - tm.assert_series_equal(df.fillna(method="bfill"), exp) - - def test_datetime64_non_nano_fillna(self): - # GH#27419 - ser = Series([Timestamp("2010-01-01"), pd.NaT, Timestamp("2000-01-01")]) - val = np.datetime64("1975-04-05", "ms") - - result = ser.fillna(val) - expected = Series( - [Timestamp("2010-01-01"), Timestamp("1975-04-05"), Timestamp("2000-01-01")] - ) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(ser.fillna(method="bfill"), exp) def test_fillna_consistency(self): # GH 16402 @@ -448,13 +418,6 @@ def test_fillna_consistency(self): s2[1] = "foo" tm.assert_series_equal(s2, expected) - def test_where_sparse(self): - # GH#17198 make sure we dont get an AttributeError for sp_index - ser = pd.Series(pd.arrays.SparseArray([1, 2])) - result = ser.where(ser >= 2, 0) - expected = pd.Series(pd.arrays.SparseArray([0, 2])) - tm.assert_series_equal(result, expected) - def test_datetime64tz_fillna_round_issue(self): # GH 14872 @@ -493,28 +456,6 @@ def test_fillna_int(self): s.fillna(method="ffill", inplace=True) tm.assert_series_equal(s.fillna(method="ffill", inplace=False), s) - def test_fillna_raise(self): - s = Series(np.random.randint(-100, 100, 50)) - msg = '"value" parameter must be a scalar or dict, but you passed a "list"' - with pytest.raises(TypeError, match=msg): - s.fillna([1, 2]) - - msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' - with pytest.raises(TypeError, match=msg): - s.fillna((1, 2)) - - # related GH 9217, make sure limit is an int and greater than 0 - s = Series([1, 2, 3, None]) - msg = ( - r"Cannot specify both 'value' and 'method'\.|" - r"Limit must be greater than 0|" - "Limit must be an integer" - ) - for limit in [-1, 0, 1.0, 2.0]: - for method in ["backfill", "bfill", "pad", "ffill", None]: - with pytest.raises(ValueError, match=msg): - s.fillna(1, limit=limit, method=method) - def test_categorical_nan_equality(self): cat = Series(Categorical(["a", "b", "c", np.nan])) exp = Series([True, True, True, False]) @@ -530,77 +471,6 @@ def test_categorical_nan_handling(self): s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8) ) - @pytest.mark.parametrize( - "fill_value, expected_output", - [ - ("a", ["a", "a", "b", "a", "a"]), - ({1: "a", 3: "b", 4: "b"}, ["a", "a", "b", "b", "b"]), - ({1: "a"}, ["a", "a", "b", np.nan, np.nan]), - ({1: "a", 3: "b"}, ["a", "a", "b", "b", np.nan]), - (Series("a"), ["a", np.nan, "b", np.nan, np.nan]), - (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]), - (Series({1: "a", 3: "b"}), ["a", "a", "b", "b", np.nan]), - (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]), - ], - ) - def test_fillna_categorical(self, fill_value, expected_output): - # GH 17033 - # Test fillna for a Categorical series - data = ["a", np.nan, "b", np.nan, np.nan] - s = 
Series(Categorical(data, categories=["a", "b"])) - exp = Series(Categorical(expected_output, categories=["a", "b"])) - tm.assert_series_equal(s.fillna(fill_value), exp) - - @pytest.mark.parametrize( - "fill_value, expected_output", - [ - (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), - (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), - ( - Series( - Categorical( - ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] - ) - ), - ["a", "d", "b", "d", "a"], - ), - ], - ) - def test_fillna_categorical_with_new_categories(self, fill_value, expected_output): - # GH 26215 - data = ["a", np.nan, "b", np.nan, np.nan] - s = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) - exp = Series(Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) - tm.assert_series_equal(s.fillna(fill_value), exp) - - def test_fillna_categorical_raise(self): - data = ["a", np.nan, "b", np.nan, np.nan] - s = Series(Categorical(data, categories=["a", "b"])) - - with pytest.raises(ValueError, match="fill value must be in categories"): - s.fillna("d") - - with pytest.raises(ValueError, match="fill value must be in categories"): - s.fillna(Series("d")) - - with pytest.raises(ValueError, match="fill value must be in categories"): - s.fillna({1: "d", 3: "a"}) - - msg = '"value" parameter must be a scalar or dict, but you passed a "list"' - with pytest.raises(TypeError, match=msg): - s.fillna(["a", "b"]) - - msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' - with pytest.raises(TypeError, match=msg): - s.fillna(("a", "b")) - - msg = ( - '"value" parameter must be a scalar, dict ' - 'or Series, but you passed a "DataFrame"' - ) - with pytest.raises(TypeError, match=msg): - s.fillna(DataFrame({1: ["a"], 3: ["b"]})) - def test_fillna_nat(self): series = Series([0, 1, 2, iNaT], dtype="M8[ns]") @@ -743,15 +613,6 @@ def test_fillna_bug(self): expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], x.index) tm.assert_series_equal(filled, expected) - def test_fillna_inplace(self): - x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) - y = x.copy() - - y.fillna(value=0, inplace=True) - - expected = x.fillna(value=0) - tm.assert_series_equal(y, expected) - def test_fillna_invalid_method(self, datetime_series): try: datetime_series.fillna(method="ffil") @@ -823,7 +684,7 @@ def test_dropna_empty(self): assert len(s) == 0 # invalid axis - msg = "No axis named 1 for object type " + msg = "No axis named 1 for object type Series" with pytest.raises(ValueError, match=msg): s.dropna(axis=1) @@ -940,14 +801,6 @@ def test_dropna_preserve_name(self, datetime_series): ts.dropna(inplace=True) assert ts.name == name - def test_fill_value_when_combine_const(self): - # GH12723 - s = Series([0, 1, np.nan, 3, 4, 5]) - - exp = s.fillna(0).add(2) - res = s.add(2, fill_value=0) - tm.assert_series_equal(res, exp) - def test_series_fillna_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index bdd9f92d92d3f..1340f514e31ce 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1,14 +1,13 @@ -from datetime import datetime, timedelta +from datetime import datetime import operator import numpy as np import pytest import pandas as pd -from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna +from pandas import DataFrame, Index, Series, bdate_range import 
pandas._testing as tm from pandas.core import ops -import pandas.core.nanops as nanops class TestSeriesLogicalOps: @@ -519,409 +518,6 @@ def test_logical_ops_df_compat(self): tm.assert_frame_equal(s4.to_frame() | s3.to_frame(), exp_or.to_frame()) -class TestSeriesComparisons: - def test_comparisons(self): - left = np.random.randn(10) - right = np.random.randn(10) - left[:3] = np.nan - - result = nanops.nangt(left, right) - with np.errstate(invalid="ignore"): - expected = (left > right).astype("O") - expected[:3] = np.nan - - tm.assert_almost_equal(result, expected) - - s = Series(["a", "b", "c"]) - s2 = Series([False, True, False]) - - # it works! - exp = Series([False, False, False]) - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) - - def test_categorical_comparisons(self): - # GH 8938 - # allow equality comparisons - a = Series(list("abc"), dtype="category") - b = Series(list("abc"), dtype="object") - c = Series(["a", "b", "cc"], dtype="object") - d = Series(list("acb"), dtype="object") - e = Categorical(list("abc")) - f = Categorical(list("acb")) - - # vs scalar - assert not (a == "a").all() - assert ((a != "a") == ~(a == "a")).all() - - assert not ("a" == a).all() - assert (a == "a")[0] - assert ("a" == a)[0] - assert not ("a" != a)[0] - - # vs list-like - assert (a == a).all() - assert not (a != a).all() - - assert (a == list(a)).all() - assert (a == b).all() - assert (b == a).all() - assert ((~(a == b)) == (a != b)).all() - assert ((~(b == a)) == (b != a)).all() - - assert not (a == c).all() - assert not (c == a).all() - assert not (a == d).all() - assert not (d == a).all() - - # vs a cat-like - assert (a == e).all() - assert (e == a).all() - assert not (a == f).all() - assert not (f == a).all() - - assert (~(a == e) == (a != e)).all() - assert (~(e == a) == (e != a)).all() - assert (~(a == f) == (a != f)).all() - assert (~(f == a) == (f != a)).all() - - # non-equality is not comparable - with pytest.raises(TypeError): - a < b - with pytest.raises(TypeError): - b < a - with pytest.raises(TypeError): - a > b - with pytest.raises(TypeError): - b > a - - def test_comparison_tuples(self): - # GH11339 - # comparisons vs tuple - s = Series([(1, 1), (1, 2)]) - - result = s == (1, 2) - expected = Series([False, True]) - tm.assert_series_equal(result, expected) - - result = s != (1, 2) - expected = Series([True, False]) - tm.assert_series_equal(result, expected) - - result = s == (0, 0) - expected = Series([False, False]) - tm.assert_series_equal(result, expected) - - result = s != (0, 0) - expected = Series([True, True]) - tm.assert_series_equal(result, expected) - - s = Series([(1, 1), (1, 1)]) - - result = s == (1, 1) - expected = Series([True, True]) - tm.assert_series_equal(result, expected) - - result = s != (1, 1) - expected = Series([False, False]) - tm.assert_series_equal(result, expected) - - s = Series([frozenset([1]), frozenset([1, 2])]) - - result = s == frozenset([1]) - expected = Series([True, False]) - tm.assert_series_equal(result, expected) - - def test_comparison_operators_with_nas(self): - ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) - ser[::2] = np.nan - - # test that comparisons work - ops = ["lt", "le", "gt", "ge", "eq", "ne"] - for op in ops: - val = ser[5] - - f = getattr(operator, op) - result = f(ser, val) - - expected = f(ser.dropna(), val).reindex(ser.index) - - if op == "ne": - expected = expected.fillna(True).astype(bool) - else: - expected = expected.fillna(False).astype(bool) - - tm.assert_series_equal(result, 
expected) - - # FIXME: dont leave commented-out - # fffffffuuuuuuuuuuuu - # result = f(val, s) - # expected = f(val, s.dropna()).reindex(s.index) - # tm.assert_series_equal(result, expected) - - def test_unequal_categorical_comparison_raises_type_error(self): - # unequal comparison should raise for unordered cats - cat = Series(Categorical(list("abc"))) - with pytest.raises(TypeError): - cat > "b" - - cat = Series(Categorical(list("abc"), ordered=False)) - with pytest.raises(TypeError): - cat > "b" - - # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057 - # and following comparisons with scalars not in categories should raise - # for unequal comps, but not for equal/not equal - cat = Series(Categorical(list("abc"), ordered=True)) - - with pytest.raises(TypeError): - cat < "d" - with pytest.raises(TypeError): - cat > "d" - with pytest.raises(TypeError): - "d" < cat - with pytest.raises(TypeError): - "d" > cat - - tm.assert_series_equal(cat == "d", Series([False, False, False])) - tm.assert_series_equal(cat != "d", Series([True, True, True])) - - def test_ne(self): - ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) - expected = [True, True, False, True, True] - assert tm.equalContents(ts.index != 5, expected) - assert tm.equalContents(~(ts.index == 5), expected) - - def test_comp_ops_df_compat(self): - # GH 1134 - s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") - s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") - - s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") - s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") - - for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: - - msg = "Can only compare identically-labeled Series objects" - with pytest.raises(ValueError, match=msg): - left == right - - with pytest.raises(ValueError, match=msg): - left != right - - with pytest.raises(ValueError, match=msg): - left < right - - msg = "Can only compare identically-labeled DataFrame objects" - with pytest.raises(ValueError, match=msg): - left.to_frame() == right.to_frame() - - with pytest.raises(ValueError, match=msg): - left.to_frame() != right.to_frame() - - with pytest.raises(ValueError, match=msg): - left.to_frame() < right.to_frame() - - def test_compare_series_interval_keyword(self): - # GH 25338 - s = Series(["IntervalA", "IntervalB", "IntervalC"]) - result = s == "IntervalA" - expected = Series([True, False, False]) - tm.assert_series_equal(result, expected) - - -class TestSeriesFlexComparisonOps: - def test_comparison_flex_alignment(self): - left = Series([1, 3, 2], index=list("abc")) - right = Series([2, 2, 2], index=list("bcd")) - - exp = pd.Series([False, False, True, False], index=list("abcd")) - tm.assert_series_equal(left.eq(right), exp) - - exp = pd.Series([True, True, False, True], index=list("abcd")) - tm.assert_series_equal(left.ne(right), exp) - - exp = pd.Series([False, False, True, False], index=list("abcd")) - tm.assert_series_equal(left.le(right), exp) - - exp = pd.Series([False, False, False, False], index=list("abcd")) - tm.assert_series_equal(left.lt(right), exp) - - exp = pd.Series([False, True, True, False], index=list("abcd")) - tm.assert_series_equal(left.ge(right), exp) - - exp = pd.Series([False, True, False, False], index=list("abcd")) - tm.assert_series_equal(left.gt(right), exp) - - def test_comparison_flex_alignment_fill(self): - left = Series([1, 3, 2], index=list("abc")) - right = Series([2, 2, 2], index=list("bcd")) - - exp = pd.Series([False, False, True, True], 
index=list("abcd")) - tm.assert_series_equal(left.eq(right, fill_value=2), exp) - - exp = pd.Series([True, True, False, False], index=list("abcd")) - tm.assert_series_equal(left.ne(right, fill_value=2), exp) - - exp = pd.Series([False, False, True, True], index=list("abcd")) - tm.assert_series_equal(left.le(right, fill_value=0), exp) - - exp = pd.Series([False, False, False, True], index=list("abcd")) - tm.assert_series_equal(left.lt(right, fill_value=0), exp) - - exp = pd.Series([True, True, True, False], index=list("abcd")) - tm.assert_series_equal(left.ge(right, fill_value=0), exp) - - exp = pd.Series([True, True, False, False], index=list("abcd")) - tm.assert_series_equal(left.gt(right, fill_value=0), exp) - - -class TestSeriesOperators: - def test_operators_empty_int_corner(self): - s1 = Series([], [], dtype=np.int32) - s2 = Series({"x": 0.0}) - tm.assert_series_equal(s1 * s2, Series([np.nan], index=["x"])) - - def test_ops_datetimelike_align(self): - # GH 7500 - # datetimelike ops need to align - dt = Series(date_range("2012-1-1", periods=3, freq="D")) - dt.iloc[2] = np.nan - dt2 = dt[::-1] - - expected = Series([timedelta(0), timedelta(0), pd.NaT]) - # name is reset - result = dt2 - dt - tm.assert_series_equal(result, expected) - - expected = Series(expected, name=0) - result = (dt2.to_frame() - dt.to_frame())[0] - tm.assert_series_equal(result, expected) - - def test_operators_corner(self, datetime_series): - empty = Series([], index=Index([]), dtype=np.float64) - - result = datetime_series + empty - assert np.isnan(result).all() - - result = empty + empty.copy() - assert len(result) == 0 - - # TODO: this returned NotImplemented earlier, what to do? - # deltas = Series([timedelta(1)] * 5, index=np.arange(5)) - # sub_deltas = deltas[::2] - # deltas5 = deltas * 5 - # deltas = deltas + sub_deltas - - # float + int - int_ts = datetime_series.astype(int)[:-5] - added = datetime_series + int_ts - expected = Series( - datetime_series.values[:-5] + int_ts.values, - index=datetime_series.index[:-5], - name="ts", - ) - tm.assert_series_equal(added[:-5], expected) - - pairings = [(Series.div, operator.truediv, 1), (Series.rdiv, ops.rtruediv, 1)] - for op in ["add", "sub", "mul", "pow", "truediv", "floordiv"]: - fv = 0 - lop = getattr(Series, op) - lequiv = getattr(operator, op) - rop = getattr(Series, "r" + op) - # bind op at definition time... 
- requiv = lambda x, y, op=op: getattr(operator, op)(y, x) - pairings.append((lop, lequiv, fv)) - pairings.append((rop, requiv, fv)) - - @pytest.mark.parametrize("op, equiv_op, fv", pairings) - def test_operators_combine(self, op, equiv_op, fv): - def _check_fill(meth, op, a, b, fill_value=0): - exp_index = a.index.union(b.index) - a = a.reindex(exp_index) - b = b.reindex(exp_index) - - amask = isna(a) - bmask = isna(b) - - exp_values = [] - for i in range(len(exp_index)): - with np.errstate(all="ignore"): - if amask[i]: - if bmask[i]: - exp_values.append(np.nan) - continue - exp_values.append(op(fill_value, b[i])) - elif bmask[i]: - if amask[i]: - exp_values.append(np.nan) - continue - exp_values.append(op(a[i], fill_value)) - else: - exp_values.append(op(a[i], b[i])) - - result = meth(a, b, fill_value=fill_value) - expected = Series(exp_values, exp_index) - tm.assert_series_equal(result, expected) - - a = Series([np.nan, 1.0, 2.0, 3.0, np.nan], index=np.arange(5)) - b = Series([np.nan, 1, np.nan, 3, np.nan, 4.0], index=np.arange(6)) - - result = op(a, b) - exp = equiv_op(a, b) - tm.assert_series_equal(result, exp) - _check_fill(op, equiv_op, a, b, fill_value=fv) - # should accept axis=0 or axis='rows' - op(a, b, axis=0) - - def test_operators_na_handling(self): - from decimal import Decimal - from datetime import date - - s = Series( - [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] - ) - - result = s + s.shift(1) - result2 = s.shift(1) + s - assert isna(result[0]) - assert isna(result2[0]) - - def test_op_duplicate_index(self): - # GH14227 - s1 = Series([1, 2], index=[1, 1]) - s2 = Series([10, 10], index=[1, 2]) - result = s1 + s2 - expected = pd.Series([11, 12, np.nan], index=[1, 1, 2]) - tm.assert_series_equal(result, expected) - - def test_divmod(self): - # GH25557 - a = Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) - b = Series([2, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) - - result = a.divmod(b) - expected = divmod(a, b) - tm.assert_series_equal(result[0], expected[0]) - tm.assert_series_equal(result[1], expected[1]) - - result = a.rdivmod(b) - expected = divmod(b, a) - tm.assert_series_equal(result[0], expected[0]) - tm.assert_series_equal(result[1], expected[1]) - - @pytest.mark.parametrize("index", [None, range(9)]) - def test_series_integer_mod(self, index): - # see gh-24396 - s1 = Series(range(1, 10)) - s2 = Series("foo", index=index) - - msg = "not all arguments converted during string formatting" - - with pytest.raises(TypeError, match=msg): - s2 % s1 - - class TestSeriesUnaryOps: # __neg__, __pos__, __inv__ diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index f41245c2872a7..b54c09e5750fd 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -20,33 +20,12 @@ def test_auto_conversion(self): ) assert series.dtype == "Period[D]" - def test_getitem(self): - assert self.series[1] == pd.Period("2000-01-02", freq="D") - - result = self.series[[2, 4]] - exp = pd.Series( - [pd.Period("2000-01-03", freq="D"), pd.Period("2000-01-05", freq="D")], - index=[2, 4], - dtype="Period[D]", - ) - tm.assert_series_equal(result, exp) - assert result.dtype == "Period[D]" - def test_isna(self): # GH 13737 s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) tm.assert_series_equal(s.isna(), Series([False, True])) tm.assert_series_equal(s.notna(), Series([True, False])) - def test_fillna(self): - # GH 13737 - s = Series([pd.Period("2011-01", freq="M"), 
pd.Period("NaT", freq="M")]) - - res = s.fillna(pd.Period("2012-01", freq="M")) - exp = Series([pd.Period("2011-01", freq="M"), pd.Period("2012-01", freq="M")]) - tm.assert_series_equal(res, exp) - assert res.dtype == "Period[M]" - def test_dropna(self): # GH 13737 s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) @@ -98,12 +77,6 @@ def test_intercept_astype_object(self): result = df.values.squeeze() assert (result[:, 0] == expected.values).all() - def test_align_series(self, join_type): - rng = period_range("1/1/2000", "1/1/2010", freq="A") - ts = Series(np.random.randn(len(rng)), index=rng) - - ts.align(ts[::2], join=join_type) - @pytest.mark.parametrize( "input_vals", [ diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py new file mode 100644 index 0000000000000..be9330a14f9c9 --- /dev/null +++ b/pandas/tests/series/test_reductions.py @@ -0,0 +1,11 @@ +import pandas as pd +from pandas import Series + + +def test_reductions_td64_with_nat(): + # GH#8617 + ser = Series([0, pd.NaT], dtype="m8[ns]") + exp = ser[0] + assert ser.median() == exp + assert ser.min() == exp + assert ser.max() == exp diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 64a8c4569406e..77f942a9e32ec 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -218,6 +218,25 @@ def test_index_repr_in_frame_with_nan(self): assert repr(s) == exp + def test_format_pre_1900_dates(self): + rng = date_range("1/1/1850", "1/1/1950", freq="A-DEC") + rng.format() + ts = Series(1, index=rng) + repr(ts) + + def test_series_repr_nat(self): + series = Series([0, 1000, 2000, pd.NaT.value], dtype="M8[ns]") + + result = repr(series) + expected = ( + "0 1970-01-01 00:00:00.000000\n" + "1 1970-01-01 00:00:00.000001\n" + "2 1970-01-01 00:00:00.000002\n" + "3 NaT\n" + "dtype: datetime64[ns]" + ) + assert result == expected + class TestCategoricalRepr: def test_categorical_repr_unicode(self): diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 8f06ea69f5d66..3c3108835416a 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -1,111 +1,12 @@ -from datetime import datetime, timedelta -from io import StringIO - import numpy as np import pytest -from pandas._libs.tslib import iNaT -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime - import pandas as pd -from pandas import ( - DataFrame, - DatetimeIndex, - NaT, - Series, - Timestamp, - concat, - date_range, - timedelta_range, - to_datetime, -) +from pandas import DataFrame, DatetimeIndex, Series, date_range, timedelta_range import pandas._testing as tm -def _simple_ts(start, end, freq="D"): - rng = date_range(start, end, freq=freq) - return Series(np.random.randn(len(rng)), index=rng) - - -def assert_range_equal(left, right): - assert left.equals(right) - assert left.freq == right.freq - assert left.tz == right.tz - - class TestTimeSeries: - def test_autocorr(self, datetime_series): - # Just run the function - corr1 = datetime_series.autocorr() - - # Now run it with the lag parameter - corr2 = datetime_series.autocorr(lag=1) - - # corr() with lag needs Series of at least length 2 - if len(datetime_series) <= 2: - assert np.isnan(corr1) - assert np.isnan(corr2) - else: - assert corr1 == corr2 - - # Choose a random lag between 1 and length of Series - 2 - # and compare the result with the Series corr() function - n = 1 + np.random.randint(max(1, len(datetime_series) - 
2)) - corr1 = datetime_series.corr(datetime_series.shift(n)) - corr2 = datetime_series.autocorr(lag=n) - - # corr() with lag needs Series of at least length 2 - if len(datetime_series) <= 2: - assert np.isnan(corr1) - assert np.isnan(corr2) - else: - assert corr1 == corr2 - - def test_first_last_valid(self, datetime_series): - ts = datetime_series.copy() - ts[:5] = np.NaN - - index = ts.first_valid_index() - assert index == ts.index[5] - - ts[-5:] = np.NaN - index = ts.last_valid_index() - assert index == ts.index[-6] - - ts[:] = np.nan - assert ts.last_valid_index() is None - assert ts.first_valid_index() is None - - ser = Series([], index=[], dtype=object) - assert ser.last_valid_index() is None - assert ser.first_valid_index() is None - - # GH12800 - empty = Series(dtype=object) - assert empty.last_valid_index() is None - assert empty.first_valid_index() is None - - # GH20499: its preserves freq with holes - ts.index = date_range("20110101", periods=len(ts), freq="B") - ts.iloc[1] = 1 - ts.iloc[-2] = 1 - assert ts.first_valid_index() == ts.index[1] - assert ts.last_valid_index() == ts.index[-2] - assert ts.first_valid_index().freq == ts.index.freq - assert ts.last_valid_index().freq == ts.index.freq - - def test_mpl_compat_hack(self, datetime_series): - - # This is currently failing because the test was relying on - # the DeprecationWarning coming through Index.__getitem__. - # We want to implement a warning specifically for Series.__getitem__ - # at which point this will become a Deprecation/FutureWarning - with tm.assert_produces_warning(None): - # GH#30588 multi-dimensional indexing deprecated - result = datetime_series[:, np.newaxis] - expected = datetime_series.values[:, np.newaxis] - tm.assert_almost_equal(result, expected) - def test_timeseries_coercion(self): idx = tm.makeDateIndex(10000) ser = Series(np.random.randn(len(idx)), idx.astype(object)) @@ -120,112 +21,13 @@ def test_contiguous_boolean_preserve_freq(self): masked = rng[mask] expected = rng[10:20] - assert expected.freq is not None - assert_range_equal(masked, expected) + assert expected.freq == rng.freq + tm.assert_index_equal(masked, expected) mask[22] = True masked = rng[mask] assert masked.freq is None - def test_to_datetime_unit(self): - - epoch = 1370745748 - s = Series([epoch + t for t in range(20)]) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - ) - tm.assert_series_equal(result, expected) - - s = Series([epoch + t for t in range(20)]).astype(float) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - ) - tm.assert_series_equal(result, expected) - - s = Series([epoch + t for t in range(20)] + [iNaT]) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - + [NaT] - ) - tm.assert_series_equal(result, expected) - - s = Series([epoch + t for t in range(20)] + [iNaT]).astype(float) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - + [NaT] - ) - tm.assert_series_equal(result, expected) - - # GH13834 - s = Series([epoch + t for t in np.arange(0, 2, 0.25)] + [iNaT]).astype(float) - result = to_datetime(s, unit="s") - expected = Series( - [ - Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) - for t in np.arange(0, 2, 0.25) - ] - + [NaT] - ) - tm.assert_series_equal(result, 
expected) - - s = concat( - [Series([epoch + t for t in range(20)]).astype(float), Series([np.nan])], - ignore_index=True, - ) - result = to_datetime(s, unit="s") - expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - + [NaT] - ) - tm.assert_series_equal(result, expected) - - result = to_datetime([1, 2, "NaT", pd.NaT, np.nan], unit="D") - expected = DatetimeIndex( - [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3 - ) - tm.assert_index_equal(result, expected) - - msg = "non convertible value foo with the unit 'D'" - with pytest.raises(ValueError, match=msg): - to_datetime([1, 2, "foo"], unit="D") - msg = "cannot convert input 111111111 with the unit 'D'" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime([1, 2, 111111111], unit="D") - - # coerce we can process - expected = DatetimeIndex( - [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1 - ) - result = to_datetime([1, 2, "foo"], unit="D", errors="coerce") - tm.assert_index_equal(result, expected) - - result = to_datetime([1, 2, 111111111], unit="D", errors="coerce") - tm.assert_index_equal(result, expected) - - def test_series_ctor_datetime64(self): - rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") - dates = np.asarray(rng) - - series = Series(dates) - assert np.issubdtype(series.dtype, np.dtype("M8[ns]")) - - def test_series_repr_nat(self): - series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]") - - result = repr(series) - expected = ( - "0 1970-01-01 00:00:00.000000\n" - "1 1970-01-01 00:00:00.000001\n" - "2 1970-01-01 00:00:00.000002\n" - "3 NaT\n" - "dtype: datetime64[ns]" - ) - assert result == expected - def test_promote_datetime_date(self): rng = date_range("1/1/2000", periods=20) ts = Series(np.random.randn(20), index=rng) @@ -249,101 +51,6 @@ def test_promote_datetime_date(self): expected = rng.get_indexer(ts_slice.index) tm.assert_numpy_array_equal(result, expected) - def test_first_subset(self): - ts = _simple_ts("1/1/2000", "1/1/2010", freq="12h") - result = ts.first("10d") - assert len(result) == 20 - - ts = _simple_ts("1/1/2000", "1/1/2010") - result = ts.first("10d") - assert len(result) == 10 - - result = ts.first("3M") - expected = ts[:"3/31/2000"] - tm.assert_series_equal(result, expected) - - result = ts.first("21D") - expected = ts[:21] - tm.assert_series_equal(result, expected) - - result = ts[:0].first("3M") - tm.assert_series_equal(result, ts[:0]) - - def test_first_raises(self): - # GH20725 - ser = pd.Series("a b c".split()) - msg = "'first' only supports a DatetimeIndex index" - with pytest.raises(TypeError, match=msg): - ser.first("1D") - - def test_last_subset(self): - ts = _simple_ts("1/1/2000", "1/1/2010", freq="12h") - result = ts.last("10d") - assert len(result) == 20 - - ts = _simple_ts("1/1/2000", "1/1/2010") - result = ts.last("10d") - assert len(result) == 10 - - result = ts.last("21D") - expected = ts["12/12/2009":] - tm.assert_series_equal(result, expected) - - result = ts.last("21D") - expected = ts[-21:] - tm.assert_series_equal(result, expected) - - result = ts[:0].last("3M") - tm.assert_series_equal(result, ts[:0]) - - def test_last_raises(self): - # GH20725 - ser = pd.Series("a b c".split()) - msg = "'last' only supports a DatetimeIndex index" - with pytest.raises(TypeError, match=msg): - ser.last("1D") - - def test_format_pre_1900_dates(self): - rng = date_range("1/1/1850", "1/1/1950", freq="A-DEC") - rng.format() - ts = Series(1, index=rng) - repr(ts) - - def 
test_to_period(self): - from pandas.core.indexes.period import period_range - - ts = _simple_ts("1/1/2000", "1/1/2001") - - pts = ts.to_period() - exp = ts.copy() - exp.index = period_range("1/1/2000", "1/1/2001") - tm.assert_series_equal(pts, exp) - - pts = ts.to_period("M") - exp.index = exp.index.asfreq("M") - tm.assert_index_equal(pts.index, exp.index.asfreq("M")) - tm.assert_series_equal(pts, exp) - - # GH 7606 without freq - idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"]) - exp_idx = pd.PeriodIndex( - ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D" - ) - - s = Series(np.random.randn(4), index=idx) - expected = s.copy() - expected.index = exp_idx - tm.assert_series_equal(s.to_period(), expected) - - df = DataFrame(np.random.randn(4, 4), index=idx, columns=idx) - expected = df.copy() - expected.index = exp_idx - tm.assert_frame_equal(df.to_period(), expected) - - expected = df.copy() - expected.columns = exp_idx - tm.assert_frame_equal(df.to_period(axis=1), expected) - def test_groupby_count_dateparseerror(self): dr = date_range(start="1/1/2012", freq="5min", periods=10) @@ -358,15 +65,6 @@ def test_groupby_count_dateparseerror(self): tm.assert_series_equal(result, expected) - def test_to_csv_numpy_16_bug(self): - frame = DataFrame({"a": date_range("1/1/2000", periods=10)}) - - buf = StringIO() - frame.to_csv(buf) - - result = buf.getvalue() - assert "2000-01-01" in result - def test_series_map_box_timedelta(self): # GH 11349 s = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) @@ -396,82 +94,6 @@ def test_asfreq_resample_set_correct_freq(self): # does .resample() set .freq correctly? assert df.resample("D").asfreq().index.freq == "D" - def test_pickle(self): - - # GH4606 - p = tm.round_trip_pickle(NaT) - assert p is NaT - - idx = pd.to_datetime(["2013-01-01", NaT, "2014-01-06"]) - idx_p = tm.round_trip_pickle(idx) - assert idx_p[0] == idx[0] - assert idx_p[1] is NaT - assert idx_p[2] == idx[2] - - # GH11002 - # don't infer freq - idx = date_range("1750-1-1", "2050-1-1", freq="7D") - idx_p = tm.round_trip_pickle(idx) - tm.assert_index_equal(idx, idx_p) - - @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Eastern"]) - def test_setops_preserve_freq(self, tz): - rng = date_range("1/1/2000", "1/1/2002", name="idx", tz=tz) - - result = rng[:50].union(rng[50:100]) - assert result.name == rng.name - assert result.freq == rng.freq - assert result.tz == rng.tz - - result = rng[:50].union(rng[30:100]) - assert result.name == rng.name - assert result.freq == rng.freq - assert result.tz == rng.tz - - result = rng[:50].union(rng[60:100]) - assert result.name == rng.name - assert result.freq is None - assert result.tz == rng.tz - - result = rng[:50].intersection(rng[25:75]) - assert result.name == rng.name - assert result.freqstr == "D" - assert result.tz == rng.tz - - nofreq = DatetimeIndex(list(rng[25:75]), name="other") - result = rng[:50].union(nofreq) - assert result.name is None - assert result.freq == rng.freq - assert result.tz == rng.tz - - result = rng[:50].intersection(nofreq) - assert result.name is None - assert result.freq == rng.freq - assert result.tz == rng.tz - - def test_from_M8_structured(self): - dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] - arr = np.array(dates, dtype=[("Date", "M8[us]"), ("Forecasting", "M8[us]")]) - df = DataFrame(arr) - - assert df["Date"][0] == dates[0][0] - assert df["Forecasting"][0] == dates[0][1] - - s = Series(arr["Date"]) - assert isinstance(s[0], Timestamp) - 
assert s[0] == dates[0][0] - - def test_get_level_values_box(self): - from pandas import MultiIndex - - dates = date_range("1/1/2000", periods=4) - levels = [dates, [0, 1]] - codes = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] - - index = MultiIndex(levels=levels, codes=codes) - - assert isinstance(index.get_level_values(0)[0], Timestamp) - def test_view_tz(self): # GH#24024 ser = pd.Series(pd.date_range("2000", periods=4, tz="US/Central")) @@ -486,6 +108,19 @@ def test_view_tz(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_asarray_object_dt64(self, tz): + ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) + + with tm.assert_produces_warning(None): + # Future behavior (for tzaware case) with no warning + result = np.asarray(ser, dtype=object) + + expected = np.array( + [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] + ) + tm.assert_numpy_array_equal(result, expected) + def test_asarray_tz_naive(self): # This shouldn't produce a warning. ser = pd.Series(pd.date_range("2000", periods=2)) @@ -494,12 +129,6 @@ def test_asarray_tz_naive(self): tm.assert_numpy_array_equal(result, expected) - # optionally, object - result = np.asarray(ser, dtype=object) - - expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) - tm.assert_numpy_array_equal(result, expected) - def test_asarray_tz_aware(self): tz = "US/Central" ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) @@ -512,11 +141,3 @@ def test_asarray_tz_aware(self): result = np.asarray(ser, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) - - # Future behavior with no warning - expected = np.array( - [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] - ) - result = np.asarray(ser, dtype=object) - - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py index e729ff91293a8..05792dc4f00d2 100644 --- a/pandas/tests/series/test_timezones.py +++ b/pandas/tests/series/test_timezones.py @@ -6,11 +6,8 @@ from dateutil.tz import tzoffset import numpy as np import pytest -import pytz -from pandas._libs.tslibs import conversion, timezones - -from pandas import Series, Timestamp +from pandas import Series import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -30,85 +27,6 @@ def test_dateutil_tzoffset_support(self): # it works! 
#2443 repr(series.index[0]) - @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) - def test_string_index_alias_tz_aware(self, tz): - rng = date_range("1/1/2000", periods=10, tz=tz) - ser = Series(np.random.randn(len(rng)), index=rng) - - result = ser["1/3/2000"] - tm.assert_almost_equal(result, ser[2]) - - # TODO: De-duplicate with test below - def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): - rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") - ser = Series(np.random.randn(len(rng)), index=rng) - - ts_moscow = ser.tz_convert("Europe/Moscow") - - result = ser + ts_moscow - assert result.index.tz is pytz.utc - - result = ts_moscow + ser - assert result.index.tz is pytz.utc - - def test_series_add_tz_mismatch_converts_to_utc(self): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") - - perm = np.random.permutation(100)[:90] - ser1 = Series( - np.random.randn(90), index=rng.take(perm).tz_convert("US/Eastern") - ) - - perm = np.random.permutation(100)[:90] - ser2 = Series( - np.random.randn(90), index=rng.take(perm).tz_convert("Europe/Berlin") - ) - - result = ser1 + ser2 - - uts1 = ser1.tz_convert("utc") - uts2 = ser2.tz_convert("utc") - expected = uts1 + uts2 - - assert result.index.tz == pytz.UTC - tm.assert_series_equal(result, expected) - - def test_series_add_aware_naive_raises(self): - rng = date_range("1/1/2011", periods=10, freq="H") - ser = Series(np.random.randn(len(rng)), index=rng) - - ser_utc = ser.tz_localize("utc") - - with pytest.raises(Exception): - ser + ser_utc - - with pytest.raises(Exception): - ser_utc + ser - - def test_series_align_aware(self): - idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") - ser = Series(np.random.randn(len(idx1)), index=idx1) - ser_central = ser.tz_convert("US/Central") - # # different timezones convert to UTC - - new1, new2 = ser.align(ser_central) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - @pytest.mark.parametrize("tzstr", ["Europe/Berlin", "dateutil/Europe/Berlin"]) - def test_getitem_pydatetime_tz(self, tzstr): - tz = timezones.maybe_get_tz(tzstr) - - index = date_range( - start="2012-12-24 16:00", end="2012-12-24 18:00", freq="H", tz=tzstr - ) - ts = Series(index=index, data=index.hour) - time_pandas = Timestamp("2012-12-24 17:00", tz=tzstr) - - dt = datetime(2012, 12, 24, 17, 0) - time_datetime = conversion.localize_pydatetime(dt, tz) - assert ts[time_pandas] == ts[time_datetime] - @pytest.mark.parametrize("copy", [True, False]) @pytest.mark.parametrize( "method, tz", [["tz_localize", None], ["tz_convert", "Europe/Berlin"]] @@ -123,10 +41,3 @@ def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz): np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) ) tm.assert_series_equal(result, expected) - - def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): - # GH 25843 - tz = tz_aware_fixture - result = Series([Timestamp("2019", tz=tz)], dtype="datetime64[ns]") - expected = Series([Timestamp("2019")]) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 536f15ea75d69..c7fc37a278e83 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -171,7 +171,7 @@ def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) @pytest.mark.parametrize("shuffle", SHUFFLE) @pytest.mark.filterwarnings("ignore:divide 
by zero:RuntimeWarning") -def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ufunc): +def test_multiple_output_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ufunc): # Test that # the same conditions from binary_ufunc_scalar apply to # ufuncs with multiple outputs. @@ -204,7 +204,7 @@ def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) -def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): +def test_multiple_output_ufunc(sparse, arrays_for_binary_ufunc): # Test that the same conditions from unary input apply to multi-output # ufuncs array, _ = arrays_for_binary_ufunc diff --git a/pandas/tests/series/test_validate.py b/pandas/tests/series/test_validate.py index 511d24ca7fa29..e2f050650b298 100644 --- a/pandas/tests/series/test_validate.py +++ b/pandas/tests/series/test_validate.py @@ -3,7 +3,15 @@ @pytest.mark.parametrize( "func", - ["reset_index", "_set_name", "sort_values", "sort_index", "rename", "dropna"], + [ + "reset_index", + "_set_name", + "sort_values", + "sort_index", + "rename", + "dropna", + "drop_duplicates", + ], ) @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_args(string_series, func, inplace): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a1de9c435c9ba..ad7028702ec8c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -760,6 +760,16 @@ def test_categorical_from_codes(self): result = algos.isin(Sd, St) tm.assert_numpy_array_equal(expected, result) + def test_categorical_isin(self): + vals = np.array([0, 1, 2, 0]) + cats = ["a", "b", "c"] + cat = Categorical(1).from_codes(vals, cats) + other = Categorical(1).from_codes(np.array([0, 1]), cats) + + expected = np.array([True, True, False, True]) + result = algos.isin(cat, other) + tm.assert_numpy_array_equal(expected, result) + def test_same_nan_is_in(self): # GH 22160 # nan is special, because from " a is b" doesn't follow "a == b" diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 186c735a0bff9..bcfed2d0d3a10 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.compat.numpy import _np_version_under1p17 + import pandas as pd from pandas import Series, Timestamp from pandas.core import ops @@ -59,8 +61,31 @@ def test_random_state(): # check with no arg random state assert com.random_state() is np.random + # check array-like + # GH32503 + state_arr_like = npr.randint(0, 2 ** 31, size=624, dtype="uint32") + assert ( + com.random_state(state_arr_like).uniform() + == npr.RandomState(state_arr_like).uniform() + ) + + # Check BitGenerators + # GH32503 + if not _np_version_under1p17: + assert ( + com.random_state(npr.MT19937(3)).uniform() + == npr.RandomState(npr.MT19937(3)).uniform() + ) + assert ( + com.random_state(npr.PCG64(11)).uniform() + == npr.RandomState(npr.PCG64(11)).uniform() + ) + # Error for floats or strings - msg = "random_state must be an integer, a numpy RandomState, or None" + msg = ( + "random_state must be an integer, array-like, a BitGenerator, " + "a numpy RandomState, or None" + ) with pytest.raises(ValueError, match=msg): com.random_state("test") diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index b2a85b539fd86..57542aa3bc7f6 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,6 +8,8 @@ import numpy as np # 
noqa import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame import pandas._testing as tm @@ -47,6 +49,19 @@ def test_xarray(df): assert df.to_xarray() is not None +@td.skip_if_no("cftime") +@td.skip_if_no("xarray", "0.10.4") +def test_xarray_cftimeindex_nearest(): + # https://github.com/pydata/xarray/issues/3751 + import cftime + import xarray + + times = xarray.cftime_range("0001", periods=2) + result = times.get_loc(cftime.DatetimeGregorian(2000, 1, 1), method="nearest") + expected = 1 + assert result == expected + + def test_oo_optimizable(): # GH 21071 subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"]) @@ -107,7 +122,6 @@ def test_pandas_datareader(): # importing from pandas, Cython import warning @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") -@pytest.mark.skip(reason="Anaconda installation issue - GH32144") def test_geopandas(): geopandas = import_module("geopandas") # noqa diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index d914cf873de24..b6f59807eaa15 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -3,6 +3,7 @@ from pandas._libs import lib, writers as libwriters +import pandas as pd from pandas import Index import pandas._testing as tm @@ -39,6 +40,11 @@ def test_fast_unique_multiple_list_gen_sort(self): out = lib.fast_unique_multiple_list_gen(gen, sort=False) tm.assert_numpy_array_equal(np.array(out), expected) + def test_fast_unique_multiple_unsortable_runtimewarning(self): + arr = [np.array(["foo", pd.Timestamp("2000")])] + with tm.assert_produces_warning(RuntimeWarning): + lib.fast_unique_multiple(arr, sort=None) + class TestIndexing: def test_maybe_indices_to_slice_left_edge(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index efaedfad1e093..dd0bac683c35c 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -6,12 +6,11 @@ import numpy as np from numpy.random import randn import pytest -import pytz from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp import pandas._testing as tm AGG_FUNCTIONS = [ @@ -80,52 +79,6 @@ def test_append(self): result = a["A"].append(b["A"]) tm.assert_series_equal(result, self.frame["A"]) - def test_append_index(self): - idx1 = Index([1.1, 1.2, 1.3]) - idx2 = pd.date_range("2011-01-01", freq="D", periods=3, tz="Asia/Tokyo") - idx3 = Index(["A", "B", "C"]) - - midx_lv2 = MultiIndex.from_arrays([idx1, idx2]) - midx_lv3 = MultiIndex.from_arrays([idx1, idx2, idx3]) - - result = idx1.append(midx_lv2) - - # see gh-7112 - tz = pytz.timezone("Asia/Tokyo") - expected_tuples = [ - (1.1, tz.localize(datetime.datetime(2011, 1, 1))), - (1.2, tz.localize(datetime.datetime(2011, 1, 2))), - (1.3, tz.localize(datetime.datetime(2011, 1, 3))), - ] - expected = Index([1.1, 1.2, 1.3] + expected_tuples) - tm.assert_index_equal(result, expected) - - result = midx_lv2.append(idx1) - expected = Index(expected_tuples + [1.1, 1.2, 1.3]) - tm.assert_index_equal(result, expected) - - result = midx_lv2.append(midx_lv2) - expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2)]) - tm.assert_index_equal(result, expected) - - result = midx_lv2.append(midx_lv3) - tm.assert_index_equal(result, expected) - - result = midx_lv3.append(midx_lv2) - expected = Index._simple_new( - np.array( - [ - 
(1.1, tz.localize(datetime.datetime(2011, 1, 1)), "A"), - (1.2, tz.localize(datetime.datetime(2011, 1, 2)), "B"), - (1.3, tz.localize(datetime.datetime(2011, 1, 3)), "C"), - ] - + expected_tuples, - dtype=object, - ), - None, - ) - tm.assert_index_equal(result, expected) - def test_dataframe_constructor(self): multi = DataFrame( np.random.randn(4, 4), @@ -295,6 +248,34 @@ def _check_counts(frame, axis=0): result = self.frame.count(level=0, numeric_only=True) tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp")) + def test_count_index_with_nan(self): + # https://github.com/pandas-dev/pandas/issues/21824 + df = DataFrame( + { + "Person": ["John", "Myla", None, "John", "Myla"], + "Age": [24.0, 5, 21.0, 33, 26], + "Single": [False, True, True, True, False], + } + ) + + # count on row labels + res = df.set_index(["Person", "Single"]).count(level="Person") + expected = DataFrame( + index=Index(["John", "Myla"], name="Person"), + columns=Index(["Age"]), + data=[2, 2], + ) + tm.assert_frame_equal(res, expected) + + # count on column labels + res = df.set_index(["Person", "Single"]).T.count(level="Person", axis=1) + expected = DataFrame( + columns=Index(["John", "Myla"], name="Person"), + index=Index(["Age"]), + data=[[2, 2]], + ) + tm.assert_frame_equal(res, expected) + def test_count_level_series(self): index = MultiIndex( levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], @@ -986,25 +967,6 @@ def test_swaplevel(self): with pytest.raises(TypeError, match=msg): DataFrame(range(3)).swaplevel() - def test_reorder_levels(self): - result = self.ymd.reorder_levels(["month", "day", "year"]) - expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2) - tm.assert_frame_equal(result, expected) - - result = self.ymd["A"].reorder_levels(["month", "day", "year"]) - expected = self.ymd["A"].swaplevel(0, 1).swaplevel(1, 2) - tm.assert_series_equal(result, expected) - - result = self.ymd.T.reorder_levels(["month", "day", "year"], axis=1) - expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1) - tm.assert_frame_equal(result, expected) - - with pytest.raises(TypeError, match="hierarchical axis"): - self.ymd.reorder_levels([1, 2], axis=1) - - with pytest.raises(IndexError, match="Too many levels"): - self.ymd.index.reorder_levels([1, 2, 3]) - def test_insert_index(self): df = self.ymd[:5].T df[2000, 1, 10] = df[2000, 1, 7] @@ -1265,43 +1227,6 @@ def test_unstack_group_index_overflow(self): result = s.unstack(4) assert result.shape == (500, 2) - def test_pyint_engine(self): - # GH 18519 : when combinations of codes cannot be represented in 64 - # bits, the index underlying the MultiIndex engine works with Python - # integers, rather than uint64. - N = 5 - keys = [ - tuple(l) - for l in [ - [0] * 10 * N, - [1] * 10 * N, - [2] * 10 * N, - [np.nan] * N + [2] * 9 * N, - [0] * N + [2] * 9 * N, - [np.nan] * N + [2] * 8 * N + [0] * N, - ] - ] - # Each level contains 4 elements (including NaN), so it is represented - # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a - # 64 bit engine and truncating the first levels, the fourth and fifth - # keys would collide; if truncating the last levels, the fifth and - # sixth; if rotating bits rather than shifting, the third and fifth. 
- - for idx in range(len(keys)): - index = MultiIndex.from_tuples(keys) - assert index.get_loc(keys[idx]) == idx - - expected = np.arange(idx + 1, dtype=np.intp) - result = index.get_indexer([keys[i] for i in expected]) - tm.assert_numpy_array_equal(result, expected) - - # With missing key: - idces = range(len(keys)) - expected = np.array([-1] + list(idces), dtype=np.intp) - missing = tuple([0, 1] * 5 * N) - result = index.get_indexer([missing] + [keys[i] for i in idces]) - tm.assert_numpy_array_equal(result, expected) - def test_to_html(self): self.ymd.columns.name = "foo" self.ymd.to_html() @@ -1355,92 +1280,6 @@ def test_level_with_tuples(self): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - def test_mixed_depth_drop(self): - arrays = [ - ["a", "top", "top", "routine1", "routine1", "routine2"], - ["", "OD", "OD", "result1", "result2", "result1"], - ["", "wx", "wy", "", "", ""], - ] - - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - df = DataFrame(randn(4, 6), columns=index) - - result = df.drop("a", axis=1) - expected = df.drop([("a", "", "")], axis=1) - tm.assert_frame_equal(expected, result) - - result = df.drop(["top"], axis=1) - expected = df.drop([("top", "OD", "wx")], axis=1) - expected = expected.drop([("top", "OD", "wy")], axis=1) - tm.assert_frame_equal(expected, result) - - result = df.drop(("top", "OD", "wx"), axis=1) - expected = df.drop([("top", "OD", "wx")], axis=1) - tm.assert_frame_equal(expected, result) - - expected = df.drop([("top", "OD", "wy")], axis=1) - expected = df.drop("top", axis=1) - - result = df.drop("result1", level=1, axis=1) - expected = df.drop( - [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 - ) - tm.assert_frame_equal(expected, result) - - def test_drop_multiindex_other_level_nan(self): - # GH 12754 - df = ( - DataFrame( - { - "A": ["one", "one", "two", "two"], - "B": [np.nan, 0.0, 1.0, 2.0], - "C": ["a", "b", "c", "c"], - "D": [1, 2, 3, 4], - } - ) - .set_index(["A", "B", "C"]) - .sort_index() - ) - result = df.drop("c", level="C") - expected = DataFrame( - [2, 1], - columns=["D"], - index=pd.MultiIndex.from_tuples( - [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] - ), - ) - tm.assert_frame_equal(result, expected) - - def test_drop_nonunique(self): - df = DataFrame( - [ - ["x-a", "x", "a", 1.5], - ["x-a", "x", "a", 1.2], - ["z-c", "z", "c", 3.1], - ["x-a", "x", "a", 4.1], - ["x-b", "x", "b", 5.1], - ["x-b", "x", "b", 4.1], - ["x-b", "x", "b", 2.2], - ["y-a", "y", "a", 1.2], - ["z-b", "z", "b", 2.1], - ], - columns=["var1", "var2", "var3", "var4"], - ) - - grp_size = df.groupby("var1").size() - drop_idx = grp_size.loc[grp_size == 1] - - idf = df.set_index(["var1", "var2", "var3"]) - - # it works! 
#2101 - result = idf.drop(drop_idx.index, level=0).reset_index() - expected = df[-df.var1.isin(drop_idx.index)] - - result.index = expected.index - - tm.assert_frame_equal(result, expected) - def test_mixed_depth_pop(self): arrays = [ ["a", "top", "top", "routine1", "routine1", "routine2"], @@ -1483,78 +1322,6 @@ def test_reindex_level_partial_selection(self): result = self.frame.T.loc[:, ["foo", "qux"]] tm.assert_frame_equal(result, expected.T) - def test_drop_level(self): - result = self.frame.drop(["bar", "qux"], level="first") - expected = self.frame.iloc[[0, 1, 2, 5, 6]] - tm.assert_frame_equal(result, expected) - - result = self.frame.drop(["two"], level="second") - expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]] - tm.assert_frame_equal(result, expected) - - result = self.frame.T.drop(["bar", "qux"], axis=1, level="first") - expected = self.frame.iloc[[0, 1, 2, 5, 6]].T - tm.assert_frame_equal(result, expected) - - result = self.frame.T.drop(["two"], axis=1, level="second") - expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T - tm.assert_frame_equal(result, expected) - - def test_drop_level_nonunique_datetime(self): - # GH 12701 - idx = Index([2, 3, 4, 4, 5], name="id") - idxdt = pd.to_datetime( - [ - "201603231400", - "201603231500", - "201603231600", - "201603231600", - "201603231700", - ] - ) - df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) - df["tstamp"] = idxdt - df = df.set_index("tstamp", append=True) - ts = Timestamp("201603231600") - assert df.index.is_unique is False - - result = df.drop(ts, level="tstamp") - expected = df.loc[idx != 4] - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("box", [Series, DataFrame]) - def test_drop_tz_aware_timestamp_across_dst(self, box): - # GH 21761 - start = Timestamp("2017-10-29", tz="Europe/Berlin") - end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") - index = pd.date_range(start, end, freq="15min") - data = box(data=[1] * len(index), index=index) - result = data.drop(start) - expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") - expected_idx = pd.date_range(expected_start, end, freq="15min") - expected = box(data=[1] * len(expected_idx), index=expected_idx) - tm.assert_equal(result, expected) - - def test_drop_preserve_names(self): - index = MultiIndex.from_arrays( - [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] - ) - - df = DataFrame(np.random.randn(6, 3), index=index) - - result = df.drop([(0, 2)]) - assert result.index.names == ("one", "two") - - def test_unicode_repr_issues(self): - levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])] - codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] - index = MultiIndex(levels=levels, codes=codes) - - repr(index.levels) - - # NumPy bug - # repr(index.get_level_values(1)) - def test_unicode_repr_level_names(self): index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"]) @@ -1631,15 +1398,6 @@ def test_assign_index_sequences(self): df.index = index repr(df) - def test_tuples_have_na(self): - index = MultiIndex( - levels=[[1, 0], [0, 1, 2, 3]], - codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], - ) - - assert isna(index[4][0]) - assert isna(index.values[4][0]) - def test_duplicate_groupby_issues(self): idx_tp = [ ("600809", "20061231"), @@ -1677,31 +1435,6 @@ def test_duplicate_mi(self): result = df.loc[("foo", "bar")] tm.assert_frame_equal(result, expected) - def test_duplicated_drop_duplicates(self): - # GH 4060 - idx = MultiIndex.from_arrays(([1, 2, 3, 1, 
2, 3], [1, 1, 1, 1, 2, 2])) - - expected = np.array([False, False, False, True, False, False], dtype=bool) - duplicated = idx.duplicated() - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2])) - tm.assert_index_equal(idx.drop_duplicates(), expected) - - expected = np.array([True, False, False, False, False, False]) - duplicated = idx.duplicated(keep="last") - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2])) - tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected) - - expected = np.array([True, False, False, True, False, False]) - duplicated = idx.duplicated(keep=False) - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2])) - tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) - def test_multiindex_set_index(self): # segfault in #3308 d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]} @@ -1713,53 +1446,6 @@ def test_multiindex_set_index(self): # it works! df.set_index(index) - def test_datetimeindex(self): - idx1 = pd.DatetimeIndex( - ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, - tz="Asia/Tokyo", - ) - idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") - idx = MultiIndex.from_arrays([idx1, idx2]) - - expected1 = pd.DatetimeIndex( - ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"], tz="Asia/Tokyo" - ) - - tm.assert_index_equal(idx.levels[0], expected1) - tm.assert_index_equal(idx.levels[1], idx2) - - # from datetime combos - # GH 7888 - date1 = datetime.date.today() - date2 = datetime.datetime.today() - date3 = Timestamp.today() - - for d1, d2 in itertools.product([date1, date2, date3], [date1, date2, date3]): - index = MultiIndex.from_product([[d1], [d2]]) - assert isinstance(index.levels[0], pd.DatetimeIndex) - assert isinstance(index.levels[1], pd.DatetimeIndex) - - def test_constructor_with_tz(self): - - index = pd.DatetimeIndex( - ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" - ) - columns = pd.DatetimeIndex( - ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" - ) - - result = MultiIndex.from_arrays([index, columns]) - - assert result.names == ["dt1", "dt2"] - tm.assert_index_equal(result.levels[0], index) - tm.assert_index_equal(result.levels[1], columns) - - result = MultiIndex.from_arrays([Series(index), Series(columns)]) - - assert result.names == ["dt1", "dt2"] - tm.assert_index_equal(result.levels[0], index) - tm.assert_index_equal(result.levels[1], columns) - def test_set_index_datetime(self): # GH 3950 df = DataFrame( @@ -2210,72 +1896,6 @@ def test_sort_index_categorical_multiindex(self): ) tm.assert_frame_equal(result, expected) - def test_is_lexsorted(self): - levels = [[0, 1], [0, 1, 2]] - - index = MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] - ) - assert index.is_lexsorted() - - index = MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]] - ) - assert not index.is_lexsorted() - - index = MultiIndex( - levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]] - ) - assert not index.is_lexsorted() - assert index.lexsort_depth == 0 - - def test_raise_invalid_sortorder(self): - # Test that the MultiIndex constructor raise when a incorrect sortorder is given - # Issue #28518 - - levels = [[0, 1], [0, 1, 2]] - - # 
Correct sortorder - MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 - ) - - with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"): - MultiIndex( - levels=levels, - codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], - sortorder=2, - ) - - with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"): - MultiIndex( - levels=levels, - codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], - sortorder=1, - ) - - def test_lexsort_depth(self): - # Test that lexsort_depth return the correct sortorder - # when it was given to the MultiIndex const. - # Issue #28518 - - levels = [[0, 1], [0, 1, 2]] - - index = MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 - ) - assert index.lexsort_depth == 2 - - index = MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=1 - ) - assert index.lexsort_depth == 1 - - index = MultiIndex( - levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=0 - ) - assert index.lexsort_depth == 0 - def test_sort_index_and_reconstruction(self): # 15622 diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index f7e652eb78e2d..cac6a59527a6e 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -19,6 +19,14 @@ has_c16 = hasattr(np, "complex128") +@pytest.fixture(params=[True, False]) +def skipna(request): + """ + Fixture to pass skipna to nanops functions. + """ + return request.param + + class TestnanopsDataFrame: def setup_method(self, method): np.random.seed(11235) @@ -89,28 +97,14 @@ def teardown_method(self, method): def check_results(self, targ, res, axis, check_dtype=True): res = getattr(res, "asm8", res) - res = getattr(res, "values", res) - - # timedeltas are a beast here - def _coerce_tds(targ, res): - if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": - if len(targ) == 1: - targ = targ[0].item() - res = res.item() - else: - targ = targ.view("i8") - return targ, res - try: - if ( - axis != 0 - and hasattr(targ, "shape") - and targ.ndim - and targ.shape != res.shape - ): - res = np.split(res, [targ.shape[0]], axis=0)[0] - except (ValueError, IndexError): - targ, res = _coerce_tds(targ, res) + if ( + axis != 0 + and hasattr(targ, "shape") + and targ.ndim + and targ.shape != res.shape + ): + res = np.split(res, [targ.shape[0]], axis=0)[0] try: tm.assert_almost_equal(targ, res, check_dtype=check_dtype) @@ -118,9 +112,7 @@ def _coerce_tds(targ, res): # handle timedelta dtypes if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": - targ, res = _coerce_tds(targ, res) - tm.assert_almost_equal(targ, res, check_dtype=check_dtype) - return + raise # There are sometimes rounding errors with # complex and object dtypes. 
@@ -149,29 +141,29 @@ def check_fun_data( targfunc, testarval, targarval, + skipna, check_dtype=True, empty_targfunc=None, **kwargs, ): for axis in list(range(targarval.ndim)) + [None]: - for skipna in [False, True]: - targartempval = targarval if skipna else testarval - if skipna and empty_targfunc and isna(targartempval).all(): - targ = empty_targfunc(targartempval, axis=axis, **kwargs) - else: - targ = targfunc(targartempval, axis=axis, **kwargs) + targartempval = targarval if skipna else testarval + if skipna and empty_targfunc and isna(targartempval).all(): + targ = empty_targfunc(targartempval, axis=axis, **kwargs) + else: + targ = targfunc(targartempval, axis=axis, **kwargs) - res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) + res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + if skipna: + res = testfunc(testarval, axis=axis, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + if axis is None: + res = testfunc(testarval, skipna=skipna, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) + if skipna and axis is None: + res = testfunc(testarval, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) - if skipna: - res = testfunc(testarval, axis=axis, **kwargs) - self.check_results(targ, res, axis, check_dtype=check_dtype) - if axis is None: - res = testfunc(testarval, skipna=skipna, **kwargs) - self.check_results(targ, res, axis, check_dtype=check_dtype) - if skipna and axis is None: - res = testfunc(testarval, **kwargs) - self.check_results(targ, res, axis, check_dtype=check_dtype) if testarval.ndim <= 1: return @@ -184,12 +176,15 @@ def check_fun_data( targfunc, testarval2, targarval2, + skipna=skipna, check_dtype=check_dtype, empty_targfunc=empty_targfunc, **kwargs, ) - def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): + def check_fun( + self, testfunc, targfunc, testar, skipna, empty_targfunc=None, **kwargs + ): targar = testar if testar.endswith("_nan") and hasattr(self, testar[:-4]): @@ -202,6 +197,7 @@ def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): targfunc, testarval, targarval, + skipna=skipna, empty_targfunc=empty_targfunc, **kwargs, ) @@ -210,6 +206,7 @@ def check_funs( self, testfunc, targfunc, + skipna, allow_complex=True, allow_all_nan=True, allow_date=True, @@ -217,10 +214,10 @@ def check_funs( allow_obj=True, **kwargs, ): - self.check_fun(testfunc, targfunc, "arr_float", **kwargs) - self.check_fun(testfunc, targfunc, "arr_float_nan", **kwargs) - self.check_fun(testfunc, targfunc, "arr_int", **kwargs) - self.check_fun(testfunc, targfunc, "arr_bool", **kwargs) + self.check_fun(testfunc, targfunc, "arr_float", skipna, **kwargs) + self.check_fun(testfunc, targfunc, "arr_float_nan", skipna, **kwargs) + self.check_fun(testfunc, targfunc, "arr_int", skipna, **kwargs) + self.check_fun(testfunc, targfunc, "arr_bool", skipna, **kwargs) objs = [ self.arr_float.astype("O"), self.arr_int.astype("O"), @@ -228,18 +225,18 @@ def check_funs( ] if allow_all_nan: - self.check_fun(testfunc, targfunc, "arr_nan", **kwargs) + self.check_fun(testfunc, targfunc, "arr_nan", skipna, **kwargs) if allow_complex: - self.check_fun(testfunc, targfunc, "arr_complex", **kwargs) - self.check_fun(testfunc, targfunc, "arr_complex_nan", **kwargs) + self.check_fun(testfunc, targfunc, "arr_complex", skipna, **kwargs) + self.check_fun(testfunc, targfunc, "arr_complex_nan", skipna, **kwargs) if 
allow_all_nan: - self.check_fun(testfunc, targfunc, "arr_nan_nanj", **kwargs) + self.check_fun(testfunc, targfunc, "arr_nan_nanj", skipna, **kwargs) objs += [self.arr_complex.astype("O")] if allow_date: targfunc(self.arr_date) - self.check_fun(testfunc, targfunc, "arr_date", **kwargs) + self.check_fun(testfunc, targfunc, "arr_date", skipna, **kwargs) objs += [self.arr_date.astype("O")] if allow_tdelta: @@ -248,7 +245,7 @@ def check_funs( except TypeError: pass else: - self.check_fun(testfunc, targfunc, "arr_tdelta", **kwargs) + self.check_fun(testfunc, targfunc, "arr_tdelta", skipna, **kwargs) objs += [self.arr_tdelta.astype("O")] if allow_obj: @@ -260,7 +257,7 @@ def check_funs( targfunc = partial( self._badobj_wrap, func=targfunc, allow_complex=allow_complex ) - self.check_fun(testfunc, targfunc, "arr_obj", **kwargs) + self.check_fun(testfunc, targfunc, "arr_obj", skipna, **kwargs) def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): if value.dtype.kind == "O": @@ -273,28 +270,22 @@ def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): @pytest.mark.parametrize( "nan_op,np_op", [(nanops.nanany, np.any), (nanops.nanall, np.all)] ) - def test_nan_funcs(self, nan_op, np_op): - # TODO: allow tdelta, doesn't break tests - self.check_funs( - nan_op, np_op, allow_all_nan=False, allow_date=False, allow_tdelta=False - ) + def test_nan_funcs(self, nan_op, np_op, skipna): + self.check_funs(nan_op, np_op, skipna, allow_all_nan=False, allow_date=False) - def test_nansum(self): + def test_nansum(self, skipna): self.check_funs( nanops.nansum, np.sum, + skipna, allow_date=False, check_dtype=False, empty_targfunc=np.nansum, ) - def test_nanmean(self): + def test_nanmean(self, skipna): self.check_funs( - nanops.nanmean, - np.mean, - allow_complex=False, # TODO: allow this, doesn't break test - allow_obj=False, - allow_date=False, + nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False, ) def test_nanmean_overflow(self): @@ -336,22 +327,24 @@ def test_returned_dtype(self, dtype): else: assert result.dtype == dtype - def test_nanmedian(self): + def test_nanmedian(self, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) self.check_funs( nanops.nanmedian, np.median, + skipna, allow_complex=False, allow_date=False, allow_obj="convert", ) @pytest.mark.parametrize("ddof", range(3)) - def test_nanvar(self, ddof): + def test_nanvar(self, ddof, skipna): self.check_funs( nanops.nanvar, np.var, + skipna, allow_complex=False, allow_date=False, allow_obj="convert", @@ -359,10 +352,11 @@ def test_nanvar(self, ddof): ) @pytest.mark.parametrize("ddof", range(3)) - def test_nanstd(self, ddof): + def test_nanstd(self, ddof, skipna): self.check_funs( nanops.nanstd, np.std, + skipna, allow_complex=False, allow_date=False, allow_obj="convert", @@ -371,13 +365,14 @@ def test_nanstd(self, ddof): @td.skip_if_no_scipy @pytest.mark.parametrize("ddof", range(3)) - def test_nansem(self, ddof): + def test_nansem(self, ddof, skipna): from scipy.stats import sem with np.errstate(invalid="ignore"): self.check_funs( nanops.nansem, sem, + skipna, allow_complex=False, allow_date=False, allow_tdelta=False, @@ -388,10 +383,10 @@ def test_nansem(self, ddof): @pytest.mark.parametrize( "nan_op,np_op", [(nanops.nanmin, np.min), (nanops.nanmax, np.max)] ) - def test_nanops_with_warnings(self, nan_op, np_op): + def test_nanops_with_warnings(self, nan_op, np_op, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - 
self.check_funs(nan_op, np_op, allow_obj=False) + self.check_funs(nan_op, np_op, skipna, allow_obj=False) def _argminmax_wrap(self, value, axis=None, func=None): res = func(value, axis) @@ -408,17 +403,17 @@ def _argminmax_wrap(self, value, axis=None, func=None): res = -1 return res - def test_nanargmax(self): + def test_nanargmax(self, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._argminmax_wrap, func=np.argmax) - self.check_funs(nanops.nanargmax, func, allow_obj=False) + self.check_funs(nanops.nanargmax, func, skipna, allow_obj=False) - def test_nanargmin(self): + def test_nanargmin(self, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._argminmax_wrap, func=np.argmin) - self.check_funs(nanops.nanargmin, func, allow_obj=False) + self.check_funs(nanops.nanargmin, func, skipna, allow_obj=False) def _skew_kurt_wrap(self, values, axis=None, func=None): if not isinstance(values.dtype.type, np.floating): @@ -433,7 +428,7 @@ def _skew_kurt_wrap(self, values, axis=None, func=None): return result @td.skip_if_no_scipy - def test_nanskew(self): + def test_nanskew(self, skipna): from scipy.stats import skew func = partial(self._skew_kurt_wrap, func=skew) @@ -441,13 +436,14 @@ def test_nanskew(self): self.check_funs( nanops.nanskew, func, + skipna, allow_complex=False, allow_date=False, allow_tdelta=False, ) @td.skip_if_no_scipy - def test_nankurt(self): + def test_nankurt(self, skipna): from scipy.stats import kurtosis func1 = partial(kurtosis, fisher=True) @@ -456,15 +452,17 @@ def test_nankurt(self): self.check_funs( nanops.nankurt, func, + skipna, allow_complex=False, allow_date=False, allow_tdelta=False, ) - def test_nanprod(self): + def test_nanprod(self, skipna): self.check_funs( nanops.nanprod, np.prod, + skipna, allow_date=False, allow_tdelta=False, empty_targfunc=np.nanprod, @@ -602,7 +600,7 @@ def test_nancorr_spearman(self): def test_invalid_method(self): targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] - msg = "Unkown method 'foo', expected one of 'kendall', 'spearman'" + msg = "Unknown method 'foo', expected one of 'kendall', 'spearman'" with pytest.raises(ValueError, match=msg): self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="foo") diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1338d801e39f4..6260d13524da3 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -41,6 +41,7 @@ def assert_series_or_index_equal(left, right): ("join", (",",), {}), ("ljust", (10,), {}), ("match", ("a",), {}), + ("fullmatch", ("a",), {}), ("normalize", ("NFC",), {}), ("pad", (10,), {}), ("partition", (" ",), {"expand": False}), @@ -1157,6 +1158,18 @@ def test_repeat(self): assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) + def test_repeat_with_null(self): + # GH: 31632 + values = Series(["a", None], dtype="string") + result = values.str.repeat([3, 4]) + exp = Series(["aaa", None], dtype="string") + tm.assert_series_equal(result, exp) + + values = Series(["a", "b"], dtype="string") + result = values.str.repeat([3, None]) + exp = Series(["aaa", None], dtype="string") + tm.assert_series_equal(result, exp) + def test_match(self): # New match behavior introduced in 0.13 values = Series(["fooBAD__barBAD", np.nan, "foo"]) @@ -1164,9 +1177,9 @@ def test_match(self): exp = Series([True, np.nan, 
False]) tm.assert_series_equal(result, exp) - values = Series(["fooBAD__barBAD", np.nan, "foo"]) + values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) result = values.str.match(".*BAD[_]+.*BAD") - exp = Series([True, np.nan, False]) + exp = Series([True, True, np.nan, False]) tm.assert_series_equal(result, exp) # mixed @@ -1196,6 +1209,22 @@ def test_match(self): exp = Series([True, np.nan, np.nan]) tm.assert_series_equal(exp, res) + def test_fullmatch(self): + # GH 32806 + values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) + result = values.str.fullmatch(".*BAD[_]+.*BAD") + exp = Series([True, False, np.nan, False]) + tm.assert_series_equal(result, exp) + + # Make sure that the new string arrays work + string_values = Series( + ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype="string" + ) + result = string_values.str.fullmatch(".*BAD[_]+.*BAD") + # Result is nullable boolean with StringDtype + string_exp = Series([True, False, np.nan, False], dtype="boolean") + tm.assert_series_equal(result, string_exp) + def test_extract_expand_None(self): values = Series(["fooBAD__barBAD", np.nan, "foo"]) with pytest.raises(ValueError, match="expand must be True or False"): @@ -3372,6 +3401,9 @@ def test_match_findall_flags(self): result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] + result = data.str.fullmatch(pat, flags=re.IGNORECASE) + assert result[0] + result = data.str.findall(pat, flags=re.IGNORECASE) assert result[0][0] == ("dave", "google", "com") @@ -3592,3 +3624,12 @@ def test_string_array_extract(): result = result.astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("klass", [tuple, list, np.array, pd.Series, pd.Index]) +def test_cat_different_classes(klass): + # https://github.com/pandas-dev/pandas/issues/33425 + s = pd.Series(["a", "b", "c"]) + result = s.str.cat(klass(["x", "y", "z"])) + expected = pd.Series(["ax", "by", "cz"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/tools/test_to_datetime.py similarity index 94% rename from pandas/tests/indexes/datetimes/test_tools.py rename to pandas/tests/tools/test_to_datetime.py index 0a774e9c0f008..d2049892705ea 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2,7 +2,7 @@ import calendar from collections import deque -from datetime import datetime, time +from datetime import datetime, timedelta import locale from dateutil.parser import parse @@ -323,8 +323,25 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): expected = pd.Index(expected_dates) tm.assert_equal(result, expected) - with pytest.raises(ValueError): - pd.to_datetime(dates, format=fmt, utc=True) + def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): + # GH 32792 + dates = [ + "2010-01-01 12:00:00 +0100", + "2010-01-01 12:00:00 -0100", + "2010-01-01 12:00:00 +0300", + "2010-01-01 12:00:00 +0400", + ] + expected_dates = [ + "2010-01-01 11:00:00+00:00", + "2010-01-01 13:00:00+00:00", + "2010-01-01 09:00:00+00:00", + "2010-01-01 08:00:00+00:00", + ] + fmt = "%Y-%m-%d %H:%M:%S %z" + + result = pd.to_datetime(dates, format=fmt, utc=True) + expected = pd.DatetimeIndex(expected_dates) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize( "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""] @@ -1056,6 +1073,23 @@ def test_to_datetime_with_format_out_of_bounds(self, dt_str): with 
pytest.raises(OutOfBoundsDatetime): pd.to_datetime(dt_str, format="%Y%m%d") + def test_to_datetime_utc(self): + arr = np.array([parse("2012-06-13T01:39:00Z")], dtype=object) + + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + def test_to_datetime_fixed_offset(self): + from pandas.tests.indexes.datetimes.test_timezones import fixed_off + + dates = [ + datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off), + ] + result = to_datetime(dates) + assert result.tz == fixed_off + class TestToDatetimeUnit: @pytest.mark.parametrize("cache", [True, False]) @@ -1376,6 +1410,86 @@ def test_to_datetime_errors_ignore_utc_true(self): expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC") tm.assert_index_equal(result, expected) + # TODO: this is moved from tests.series.test_timeseries, may be redundant + def test_to_datetime_unit(self): + + epoch = 1370745748 + s = Series([epoch + t for t in range(20)]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + ) + tm.assert_series_equal(result, expected) + + s = Series([epoch + t for t in range(20)]).astype(float) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + ) + tm.assert_series_equal(result, expected) + + s = Series([epoch + t for t in range(20)] + [iNaT]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) + tm.assert_series_equal(result, expected) + + s = Series([epoch + t for t in range(20)] + [iNaT]).astype(float) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) + tm.assert_series_equal(result, expected) + + # GH13834 + s = Series([epoch + t for t in np.arange(0, 2, 0.25)] + [iNaT]).astype(float) + result = to_datetime(s, unit="s") + expected = Series( + [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) + for t in np.arange(0, 2, 0.25) + ] + + [NaT] + ) + tm.assert_series_equal(result, expected) + + s = pd.concat( + [Series([epoch + t for t in range(20)]).astype(float), Series([np.nan])], + ignore_index=True, + ) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) + tm.assert_series_equal(result, expected) + + result = to_datetime([1, 2, "NaT", pd.NaT, np.nan], unit="D") + expected = DatetimeIndex( + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3 + ) + tm.assert_index_equal(result, expected) + + msg = "non convertible value foo with the unit 'D'" + with pytest.raises(ValueError, match=msg): + to_datetime([1, 2, "foo"], unit="D") + msg = "cannot convert input 111111111 with the unit 'D'" + with pytest.raises(OutOfBoundsDatetime, match=msg): + to_datetime([1, 2, 111111111], unit="D") + + # coerce we can process + expected = DatetimeIndex( + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1 + ) + result = to_datetime([1, 2, "foo"], unit="D", errors="coerce") + tm.assert_index_equal(result, expected) + + result = to_datetime([1, 2, 111111111], unit="D", errors="coerce") + tm.assert_index_equal(result, expected) + class TestToDatetimeMisc: def test_to_datetime_barely_out_of_bounds(self): @@ -1748,6 +1862,18 @@ def 
test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): pd.to_datetime(s, infer_datetime_format=True, cache=cache), ) + @pytest.mark.parametrize( + "tz_name, offset", [("UTC", 0), ("UTC-3", 180), ("UTC+3", -180)] + ) + def test_infer_datetime_format_tz_name(self, tz_name, offset): + # GH 33133 + s = pd.Series([f"2019-02-02 08:07:13 {tz_name}"]) + result = to_datetime(s, infer_datetime_format=True) + expected = pd.Series( + [pd.Timestamp("2019-02-02 08:07:13").tz_localize(pytz.FixedOffset(offset))] + ) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 @@ -2032,52 +2158,6 @@ def test_parsers_timestring(self, cache): assert result4 == exp_now assert result5 == exp_now - @td.skip_if_has_locale - def test_parsers_time(self): - # GH11818 - strings = [ - "14:15", - "1415", - "2:15pm", - "0215pm", - "14:15:00", - "141500", - "2:15:00pm", - "021500pm", - time(14, 15), - ] - expected = time(14, 15) - - for time_string in strings: - assert tools.to_time(time_string) == expected - - new_string = "14.15" - msg = r"Cannot convert arg \['14\.15'\] to a time" - with pytest.raises(ValueError, match=msg): - tools.to_time(new_string) - assert tools.to_time(new_string, format="%H.%M") == expected - - arg = ["14:15", "20:20"] - expected_arr = [time(14, 15), time(20, 20)] - assert tools.to_time(arg) == expected_arr - assert tools.to_time(arg, format="%H:%M") == expected_arr - assert tools.to_time(arg, infer_time_format=True) == expected_arr - assert tools.to_time(arg, format="%I:%M%p", errors="coerce") == [None, None] - - res = tools.to_time(arg, format="%I:%M%p", errors="ignore") - tm.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_)) - - with pytest.raises(ValueError): - tools.to_time(arg, format="%I:%M%p", errors="raise") - - tm.assert_series_equal( - tools.to_time(Series(arg, name="test")), Series(expected_arr, name="test") - ) - - res = tools.to_time(np.array(arg)) - assert isinstance(res, list) - assert res == expected_arr - @pytest.mark.parametrize("cache", [True, False]) @pytest.mark.parametrize( "dt_string, tz, dt_string_repr", diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_to_numeric.py similarity index 96% rename from pandas/tests/tools/test_numeric.py rename to pandas/tests/tools/test_to_numeric.py index 19385e797467c..263887a8ea36e 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -627,3 +627,25 @@ def test_non_coerce_uint64_conflict(errors, exp): else: result = to_numeric(ser, errors=errors) tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize("dc1", ["integer", "float", "unsigned"]) +@pytest.mark.parametrize("dc2", ["integer", "float", "unsigned"]) +def test_downcast_empty(dc1, dc2): + # GH32493 + + tm.assert_numpy_array_equal( + pd.to_numeric([], downcast=dc1), + pd.to_numeric([], downcast=dc2), + check_dtype=False, + ) + + +def test_failure_to_convert_uint64_string_to_NaN(): + # GH 32394 + result = to_numeric("uint64", errors="coerce") + assert np.isnan(result) + + ser = Series([32, 64, np.nan]) + result = to_numeric(pd.Series(["32", "64", "uint64"]), errors="coerce") + tm.assert_series_equal(result, ser) diff --git a/pandas/tests/tools/test_to_time.py b/pandas/tests/tools/test_to_time.py new file mode 100644 index 0000000000000..17ab492aca725 --- /dev/null +++ b/pandas/tests/tools/test_to_time.py @@ -0,0 +1,58 @@ +from datetime import time + +import numpy as np +import 
pytest + +import pandas.util._test_decorators as td + +from pandas import Series +import pandas._testing as tm +from pandas.core.tools.datetimes import to_time + + +class TestToTime: + @td.skip_if_has_locale + def test_parsers_time(self): + # GH#11818 + strings = [ + "14:15", + "1415", + "2:15pm", + "0215pm", + "14:15:00", + "141500", + "2:15:00pm", + "021500pm", + time(14, 15), + ] + expected = time(14, 15) + + for time_string in strings: + assert to_time(time_string) == expected + + new_string = "14.15" + msg = r"Cannot convert arg \['14\.15'\] to a time" + with pytest.raises(ValueError, match=msg): + to_time(new_string) + assert to_time(new_string, format="%H.%M") == expected + + arg = ["14:15", "20:20"] + expected_arr = [time(14, 15), time(20, 20)] + assert to_time(arg) == expected_arr + assert to_time(arg, format="%H:%M") == expected_arr + assert to_time(arg, infer_time_format=True) == expected_arr + assert to_time(arg, format="%I:%M%p", errors="coerce") == [None, None] + + res = to_time(arg, format="%I:%M%p", errors="ignore") + tm.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_)) + + with pytest.raises(ValueError): + to_time(arg, format="%I:%M%p", errors="raise") + + tm.assert_series_equal( + to_time(Series(arg, name="test")), Series(expected_arr, name="test") + ) + + res = to_time(np.array(arg)) + assert isinstance(res, list) + assert res == expected_arr diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/tools/test_to_timedelta.py similarity index 100% rename from pandas/tests/indexes/timedeltas/test_tools.py rename to pandas/tests/tools/test_to_timedelta.py diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py index 6f6e32411a784..aab86d3a2df69 100644 --- a/pandas/tests/tslibs/test_ccalendar.py +++ b/pandas/tests/tslibs/test_ccalendar.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import date, datetime import numpy as np import pytest @@ -25,3 +25,26 @@ def test_get_day_of_year_dt(): expected = (dt - dt.replace(month=1, day=1)).days + 1 assert result == expected + + +@pytest.mark.parametrize( + "input_date_tuple, expected_iso_tuple", + [ + [(2020, 1, 1), (2020, 1, 3)], + [(2019, 12, 31), (2020, 1, 2)], + [(2019, 12, 30), (2020, 1, 1)], + [(2009, 12, 31), (2009, 53, 4)], + [(2010, 1, 1), (2009, 53, 5)], + [(2010, 1, 3), (2009, 53, 7)], + [(2010, 1, 4), (2010, 1, 1)], + [(2006, 1, 1), (2005, 52, 7)], + [(2005, 12, 31), (2005, 52, 6)], + [(2008, 12, 28), (2008, 52, 7)], + [(2008, 12, 29), (2009, 1, 1)], + ], +) +def test_dt_correct_iso_8601_year_week_and_day(input_date_tuple, expected_iso_tuple): + result = ccalendar.get_iso_calendar(*input_date_tuple) + expected_from_date_isocalendar = date(*input_date_tuple).isocalendar() + assert result == expected_from_date_isocalendar + assert result == expected_iso_tuple diff --git a/pandas/tests/tslibs/test_timedeltas.py b/pandas/tests/tslibs/test_timedeltas.py index 86d5cc749b5e1..c87752ccf151e 100644 --- a/pandas/tests/tslibs/test_timedeltas.py +++ b/pandas/tests/tslibs/test_timedeltas.py @@ -28,3 +28,9 @@ def test_delta_to_nanoseconds_error(): with pytest.raises(TypeError, match=""): delta_to_nanoseconds(obj) + + +def test_huge_nanoseconds_overflow(): + # GH 32402 + assert delta_to_nanoseconds(Timedelta(1e10)) == 1e10 + assert delta_to_nanoseconds(Timedelta(nanoseconds=1e10)) == 1e10 diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 3090343ba2fd9..fe3e1ff906919 100644 --- 
a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -1,5 +1,6 @@ import pytest +import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -177,6 +178,7 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): msg = f"""{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) are different {obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) values are different \\(33\\.33333 %\\) +\\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\[4, 5, 6\\] \\[right\\]: \\[4, 5, 7\\]""" @@ -196,6 +198,7 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): """{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) are different {obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) values are different \\(33\\.33333 %\\) +\\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\[é, è, ë\\] \\[right\\]: \\[é, è, e̊\\]""", ), @@ -205,6 +208,7 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): """{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) are different {obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) values are different \\(100\\.0 %\\) +\\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\[á, à, ä\\] \\[right\\]: \\[a, a, a\\]""", ), @@ -218,3 +222,41 @@ def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, obj_fixture): msg = msg.format(obj=obj_fixture) with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) + + +def test_assert_frame_equal_extension_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/32747 + left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + right = left.astype(int) + + msg = ( + "Attributes of DataFrame\\.iloc\\[:, 0\\] " + '\\(column name="a"\\) are different\n\n' + 'Attribute "dtype" are different\n' + "\\[left\\]: Int64\n" + "\\[right\\]: int[32|64]" + ) + + tm.assert_frame_equal(left, right, check_dtype=False) + + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(left, right, check_dtype=True) + + +def test_assert_frame_equal_interval_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/32747 + left = DataFrame({"a": [pd.Interval(0, 1)]}, dtype="interval") + right = left.astype(object) + + msg = ( + "Attributes of DataFrame\\.iloc\\[:, 0\\] " + '\\(column name="a"\\) are different\n\n' + 'Attribute "dtype" are different\n' + "\\[left\\]: interval\\[int64\\]\n" + "\\[right\\]: object" + ) + + tm.assert_frame_equal(left, right, check_dtype=False) + + with pytest.raises(AssertionError, match=msg): + tm.assert_frame_equal(left, right, check_dtype=True) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index eaf0824f52927..8bf3d82672695 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -1,5 +1,6 @@ import pytest +import pandas as pd from pandas import Categorical, DataFrame, Series import pandas._testing as tm @@ -168,6 +169,7 @@ def test_series_equal_values_mismatch(check_less_precise): msg = """Series are different Series values are different \\(33\\.33333 %\\) +\\[index\\]: \\[0, 1, 2\\] \\[left\\]: \\[1, 2, 3\\] \\[right\\]: \\[1, 2, 4\\]""" @@ -194,3 +196,58 @@ def test_series_equal_categorical_mismatch(check_categorical): tm.assert_series_equal(s1, s2, check_categorical=check_categorical) else: _assert_series_equal_both(s1, s2, check_categorical=check_categorical) + + +def test_assert_series_equal_extension_dtype_mismatch(): + # 
https://github.com/pandas-dev/pandas/issues/32747 + left = Series(pd.array([1, 2, 3], dtype="Int64")) + right = left.astype(int) + + msg = """Attributes of Series are different + +Attribute "dtype" are different +\\[left\\]: Int64 +\\[right\\]: int[32|64]""" + + tm.assert_series_equal(left, right, check_dtype=False) + + with pytest.raises(AssertionError, match=msg): + tm.assert_series_equal(left, right, check_dtype=True) + + +def test_assert_series_equal_interval_dtype_mismatch(): + # https://github.com/pandas-dev/pandas/issues/32747 + left = Series([pd.Interval(0, 1)], dtype="interval") + right = left.astype(object) + + msg = """Attributes of Series are different + +Attribute "dtype" are different +\\[left\\]: interval\\[int64\\] +\\[right\\]: object""" + + tm.assert_series_equal(left, right, check_dtype=False) + + with pytest.raises(AssertionError, match=msg): + tm.assert_series_equal(left, right, check_dtype=True) + + +def test_series_equal_series_type(): + class MySeries(Series): + pass + + s1 = Series([1, 2]) + s2 = Series([1, 2]) + s3 = MySeries([1, 2]) + + tm.assert_series_equal(s1, s2, check_series_type=False) + tm.assert_series_equal(s1, s2, check_series_type=True) + + tm.assert_series_equal(s1, s3, check_series_type=False) + tm.assert_series_equal(s3, s1, check_series_type=False) + + with pytest.raises(AssertionError, match="Series classes are different"): + tm.assert_series_equal(s1, s3, check_series_type=True) + + with pytest.raises(AssertionError, match="Series classes are different"): + tm.assert_series_equal(s3, s1, check_series_type=True) diff --git a/pandas/tests/util/test_deprecate_nonkeyword_arguments.py b/pandas/tests/util/test_deprecate_nonkeyword_arguments.py new file mode 100644 index 0000000000000..05bc617232bdd --- /dev/null +++ b/pandas/tests/util/test_deprecate_nonkeyword_arguments.py @@ -0,0 +1,101 @@ +""" +Tests for the `deprecate_nonkeyword_arguments` decorator +""" + +import warnings + +from pandas.util._decorators import deprecate_nonkeyword_arguments + +import pandas._testing as tm + + +@deprecate_nonkeyword_arguments(version="1.1", allowed_args=["a", "b"]) +def f(a, b=0, c=0, d=0): + return a + b + c + d + + +def test_one_argument(): + with tm.assert_produces_warning(None): + assert f(19) == 19 + + +def test_one_and_one_arguments(): + with tm.assert_produces_warning(None): + assert f(19, d=6) == 25 + + +def test_two_arguments(): + with tm.assert_produces_warning(None): + assert f(1, 5) == 6 + + +def test_two_and_two_arguments(): + with tm.assert_produces_warning(None): + assert f(1, 3, c=3, d=5) == 12 + + +def test_three_arguments(): + with tm.assert_produces_warning(FutureWarning): + assert f(6, 3, 3) == 12 + + +def test_four_arguments(): + with tm.assert_produces_warning(FutureWarning): + assert f(1, 2, 3, 4) == 10 + + +@deprecate_nonkeyword_arguments(version="1.1") +def g(a, b=0, c=0, d=0): + with tm.assert_produces_warning(None): + return a + b + c + d + + +def test_one_and_three_arguments_default_allowed_args(): + with tm.assert_produces_warning(None): + assert g(1, b=3, c=3, d=5) == 12 + + +def test_three_arguments_default_allowed_args(): + with tm.assert_produces_warning(FutureWarning): + assert g(6, 3, 3) == 12 + + +def test_three_positional_argument_with_warning_message_analysis(): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + assert g(6, 3, 3) == 12 + assert len(w) == 1 + for actual_warning in w: + assert actual_warning.category == FutureWarning + assert 
str(actual_warning.message) == ( + "Starting with Pandas version 1.1 all arguments of g " + "except for the argument 'a' will be keyword-only" + ) + + +@deprecate_nonkeyword_arguments(version="1.1") +def h(a=0, b=0, c=0, d=0): + return a + b + c + d + + +def test_all_keyword_arguments(): + with tm.assert_produces_warning(None): + assert h(a=1, b=2) == 3 + + +def test_one_positional_argument(): + with tm.assert_produces_warning(FutureWarning): + assert h(23) == 23 + + +def test_one_positional_argument_with_warning_message_analysis(): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + assert h(19) == 19 + assert len(w) == 1 + for actual_warning in w: + assert actual_warning.category == FutureWarning + assert str(actual_warning.message) == ( + "Starting with Pandas version 1.1 all arguments " + "of h will be keyword-only" + ) diff --git a/pandas/tests/util/test_doc.py b/pandas/tests/util/test_doc.py index 7e5e24456b9a7..50859564e654f 100644 --- a/pandas/tests/util/test_doc.py +++ b/pandas/tests/util/test_doc.py @@ -14,13 +14,15 @@ def cumsum(whatever): @doc( cumsum, - """ - Examples - -------- + dedent( + """ + Examples + -------- - >>> cumavg([1, 2, 3]) - 2 - """, + >>> cumavg([1, 2, 3]) + 2 + """ + ), method="cumavg", operation="average", ) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 606520c6d68ca..bb93c70b8a597 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -3,7 +3,7 @@ from pandas import DataFrame, Series import pandas._testing as tm -from pandas.api.indexers import BaseIndexer +from pandas.api.indexers import BaseIndexer, FixedForwardWindowIndexer from pandas.core.window.indexers import ExpandingIndexer @@ -80,3 +80,49 @@ def get_window_bounds(self, num_values, min_periods, center, closed): indexer = CustomIndexer() with pytest.raises(NotImplementedError, match="BaseIndexer subclasses not"): df.rolling(indexer, win_type="boxcar") + + +@pytest.mark.parametrize("func", ["std", "var", "count", "skew", "cov", "corr"]) +def test_notimplemented_functions(func): + # GH 32865 + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + return np.array([0, 1]), np.array([1, 2]) + + df = DataFrame({"values": range(2)}) + indexer = CustomIndexer() + with pytest.raises(NotImplementedError, match=f"{func} is not supported"): + getattr(df.rolling(indexer), func)() + + +@pytest.mark.parametrize("constructor", [Series, DataFrame]) +@pytest.mark.parametrize( + "func,alt_func,expected", + [ + ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan]), + ("max", np.max, [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan]), + ], +) +def test_rolling_forward_window(constructor, func, alt_func, expected): + # GH 32865 + values = np.arange(10) + values[5] = 100.0 + + indexer = FixedForwardWindowIndexer(window_size=3) + + match = "Forward-looking windows can't have center=True" + with pytest.raises(ValueError, match=match): + rolling = constructor(values).rolling(window=indexer, center=True) + result = getattr(rolling, func)() + + match = "Forward-looking windows don't support setting the closed argument" + with pytest.raises(ValueError, match=match): + rolling = constructor(values).rolling(window=indexer, closed="right") + result = getattr(rolling, func)() + + rolling = constructor(values).rolling(window=indexer, min_periods=2) + result = getattr(rolling, func)() + expected = 
constructor(expected) + tm.assert_equal(result, expected) + expected2 = constructor(rolling.apply(lambda x: alt_func(x))) + tm.assert_equal(result, expected2) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 5f5e10b5dd497..0c5289cd78fed 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -709,20 +709,25 @@ def test_rolling_cov_offset(self): tm.assert_series_equal(result, expected2) def test_rolling_on_decreasing_index(self): - # GH-19248 + # GH-19248, GH-32385 index = [ - Timestamp("20190101 09:00:00"), - Timestamp("20190101 09:00:02"), - Timestamp("20190101 09:00:03"), - Timestamp("20190101 09:00:05"), - Timestamp("20190101 09:00:06"), + Timestamp("20190101 09:00:30"), + Timestamp("20190101 09:00:27"), + Timestamp("20190101 09:00:20"), + Timestamp("20190101 09:00:18"), + Timestamp("20190101 09:00:10"), ] - df = DataFrame({"column": [3, 4, 4, 2, 1]}, index=reversed(index)) - result = df.rolling("2s").min() - expected = DataFrame( - {"column": [3.0, 3.0, 3.0, 2.0, 1.0]}, index=reversed(index) - ) + df = DataFrame({"column": [3, 4, 4, 5, 6]}, index=index) + result = df.rolling("5s").min() + expected = DataFrame({"column": [3.0, 3.0, 4.0, 4.0, 6.0]}, index=index) + tm.assert_frame_equal(result, expected) + + def test_rolling_on_empty(self): + # GH-32385 + df = DataFrame({"column": []}, index=[]) + result = df.rolling("5s").min() + expected = DataFrame({"column": []}, index=[]) tm.assert_frame_equal(result, expected) def test_rolling_on_multi_index_level(self): diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index 41b9d9e84f27e..c7c45f0e5e0de 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -29,11 +29,10 @@ def test_constructor(self, which): c(win_type="boxcar", window=2, min_periods=1, center=False) # not valid - msg = "|".join(["min_periods must be an integer", "center must be a boolean"]) for w in [2.0, "foo", np.array([2])]: - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match="min_periods must be an integer"): c(win_type="boxcar", window=2, min_periods=w) - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match="center must be a boolean"): c(win_type="boxcar", window=2, min_periods=1, center=w) for wt in ["foobar", 1]: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 1a1b7e8e1bd08..12320cd52cec8 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -4,7 +4,6 @@ import warnings import numpy as np -from pytz import AmbiguousTimeError from pandas._libs.algos import unique_deltas from pandas._libs.tslibs import Timedelta, Timestamp @@ -20,7 +19,7 @@ from pandas.core.dtypes.common import ( is_datetime64_dtype, - is_period_arraylike, + is_period_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCSeries @@ -92,23 +91,23 @@ def to_offset(freq) -> Optional[DateOffset]: See Also -------- - DateOffset + DateOffset : Standard kind of date increment used for a date range. 
Examples -------- - >>> to_offset('5min') + >>> to_offset("5min") <5 * Minutes> - >>> to_offset('1D1H') + >>> to_offset("1D1H") <25 * Hours> - >>> to_offset(('W', 2)) + >>> to_offset(("W", 2)) <2 * Weeks: weekday=6> - >>> to_offset((2, 'B')) + >>> to_offset((2, "B")) <2 * BusinessDays> - >>> to_offset(datetime.timedelta(days=1)) + >>> to_offset(pd.Timedelta(days=1)) >>> to_offset(Hour()) @@ -148,13 +147,11 @@ def to_offset(freq) -> Optional[DateOffset]: delta = None stride_sign = None try: - splitted = re.split(libfreqs.opattern, freq) - if splitted[-1] != "" and not splitted[-1].isspace(): + split = re.split(libfreqs.opattern, freq) + if split[-1] != "" and not split[-1].isspace(): # the last element must be blank raise ValueError("last element must be blank") - for sep, stride, name in zip( - splitted[0::4], splitted[1::4], splitted[2::4] - ): + for sep, stride, name in zip(split[0::4], split[1::4], split[2::4]): if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") prefix = libfreqs._lite_rule_alias.get(name) or name @@ -250,9 +247,14 @@ def infer_freq(index, warn: bool = True) -> Optional[str]: Returns ------- str or None - None if no discernible frequency - TypeError if the index is not datetime-like - ValueError if there are less than three values. + None if no discernible frequency. + + Raises + ------ + TypeError + If the index is not datetime-like. + ValueError + If there are fewer than three values. """ import pandas as pd @@ -270,7 +272,7 @@ def infer_freq(index, warn: bool = True) -> Optional[str]: index = values inferer: _FrequencyInferer - if is_period_arraylike(index): + if is_period_dtype(index): raise TypeError( "PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq." @@ -285,13 +287,10 @@ def infer_freq(index, warn: bool = True) -> Optional[str]: raise TypeError( f"cannot infer freq from a non-convertible index type {type(index)}" ) - index = index.values + index = index._values if not isinstance(index, pd.DatetimeIndex): - try: - index = pd.DatetimeIndex(index) - except AmbiguousTimeError: - index = pd.DatetimeIndex(index.asi8) + index = pd.DatetimeIndex(index) inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq() @@ -304,13 +303,13 @@ class _FrequencyInferer: def __init__(self, index, warn: bool = True): self.index = index - self.values = index.asi8 + self.i8values = index.asi8 # This moves the values, which are implicitly in UTC, to the # the timezone so they are in local time if hasattr(index, "tz"): if index.tz is not None: - self.values = tz_convert(self.values, UTC, index.tz) + self.i8values = tz_convert(self.i8values, UTC, index.tz) self.warn = warn @@ -323,10 +322,12 @@ def __init__(self, index, warn: bool = True): @cache_readonly def deltas(self): - return unique_deltas(self.values) + return unique_deltas(self.i8values) @cache_readonly def deltas_asi8(self): + # NB: we cannot use self.i8values here because we may have converted + # the tz in __init__ return unique_deltas(self.index.asi8) @cache_readonly @@ -340,7 +341,7 @@ def is_unique_asi8(self) -> bool: def get_freq(self) -> Optional[str]: """ Find the appropriate frequency string to describe the inferred - frequency of self.values + frequency of self.i8values Returns ------- @@ -392,11 +393,11 @@ def hour_deltas(self): @cache_readonly def fields(self): - return build_field_sarray(self.values) + return build_field_sarray(self.i8values) @cache_readonly def rep_stamp(self): - return Timestamp(self.values[0]) + return Timestamp(self.i8values[0]) 
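
A minimal sketch (not part of the patch) of the `to_offset` / `infer_freq` behaviour documented in this hunk, assuming a pandas build where both imports resolve:

```python
# Minimal sketch of the to_offset / infer_freq behaviour documented above.
# Illustration only; assumes a pandas build where these imports resolve.
import pandas as pd
from pandas.tseries.frequencies import to_offset

print(to_offset("5min"))   # <5 * Minutes>
print(to_offset("1D1H"))   # <25 * Hours>; compound aliases collapse to hours

idx = pd.date_range("2020-01-01", periods=5, freq="D")
print(pd.infer_freq(idx))  # 'D'

# Fewer than three values cannot pin down a frequency, so a ValueError is raised.
try:
    pd.infer_freq(idx[:2])
except ValueError as err:
    print(err)
```
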
def month_position_check(self): return libresolution.month_position_check(self.fields, self.index.dayofweek) @@ -490,6 +491,7 @@ def _is_business_daily(self) -> bool: ) def _get_wom_rule(self) -> Optional[str]: + # FIXME: dont leave commented-out # wdiffs = unique(np.diff(self.index.week)) # We also need -47, -49, -48 to catch index spanning year boundary # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all(): diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index fe30130e87c01..8ab37f787bd10 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -157,15 +157,34 @@ class from pandas.tseries.offsets -------- >>> from pandas.tseries.holiday import Holiday, nearest_workday >>> from dateutil.relativedelta import MO - >>> USMemorialDay = Holiday('Memorial Day', month=5, day=31, - offset=pd.DateOffset(weekday=MO(-1))) - >>> USLaborDay = Holiday('Labor Day', month=9, day=1, - offset=pd.DateOffset(weekday=MO(1))) - >>> July3rd = Holiday('July 3rd', month=7, day=3,) - >>> NewYears = Holiday('New Years Day', month=1, day=1, - observance=nearest_workday), - >>> July3rd = Holiday('July 3rd', month=7, day=3, - days_of_week=(0, 1, 2, 3)) + + >>> USMemorialDay = Holiday( + ... "Memorial Day", month=5, day=31, offset=pd.DateOffset(weekday=MO(-1)) + ... ) + >>> USMemorialDay + Holiday: Memorial Day (month=5, day=31, offset=) + + >>> USLaborDay = Holiday( + ... "Labor Day", month=9, day=1, offset=pd.DateOffset(weekday=MO(1)) + ... ) + >>> USLaborDay + Holiday: Labor Day (month=9, day=1, offset=) + + >>> July3rd = Holiday("July 3rd", month=7, day=3) + >>> July3rd + Holiday: July 3rd (month=7, day=3, ) + + >>> NewYears = Holiday( + ... "New Years Day", month=1, day=1, observance=nearest_workday + ... ) + >>> NewYears # doctest: +SKIP + Holiday: New Years Day ( + month=1, day=1, observance= + ) + + >>> July3rd = Holiday("July 3rd", month=7, day=3, days_of_week=(0, 1, 2, 3)) + >>> July3rd + Holiday: July 3rd (month=7, day=3, ) """ if offset is not None and observance is not None: raise NotImplementedError("Cannot use both offset and observance.") diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index b6bbe008812cb..bc20d784c8dee 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -337,9 +337,6 @@ def apply_index(self, i): # integer addition on PeriodIndex is deprecated, # so we directly use _time_shift instead asper = i.to_period("W") - if not isinstance(asper._data, np.ndarray): - # unwrap PeriodIndex --> PeriodArray - asper = asper._data shifted = asper._time_shift(weeks) i = shifted.to_timestamp() + i.to_perioddelta("W") @@ -629,9 +626,6 @@ def apply_index(self, i): # to_period rolls forward to next BDay; track and # reduce n where it does when rolling forward asper = i.to_period("B") - if not isinstance(asper._data, np.ndarray): - # unwrap PeriodIndex --> PeriodArray - asper = asper._data if self.n > 0: shifted = (i.to_perioddelta("B") - time).asi8 != 0 @@ -1384,9 +1378,6 @@ def apply_index(self, i): # integer-array addition on PeriodIndex is deprecated, # so we use _addsub_int_array directly asper = i.to_period("M") - if not isinstance(asper._data, np.ndarray): - # unwrap PeriodIndex --> PeriodArray - asper = asper._data shifted = asper._addsub_int_array(roll // 2, operator.add) i = type(dti)(shifted.to_timestamp()) @@ -1582,9 +1573,6 @@ def apply_index(self, i): # integer addition on PeriodIndex is deprecated, # so we use _time_shift directly asper = i.to_period("W") - if not isinstance(asper._data, np.ndarray): - # 
unwrap PeriodIndex --> PeriodArray - asper = asper._data shifted = asper._time_shift(self.n) return shifted.to_timestamp() + i.to_perioddelta("W") @@ -1608,9 +1596,6 @@ def _end_apply_index(self, dtindex): base, mult = libfrequencies.get_freq_code(self.freqstr) base_period = dtindex.to_period(base) - if not isinstance(base_period._data, np.ndarray): - # unwrap PeriodIndex --> PeriodArray - base_period = base_period._data if self.n > 0: # when adding, dates on end roll to next diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index d854be062fcbb..17815c437249b 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -1,24 +1,11 @@ from functools import wraps import inspect from textwrap import dedent -from typing import ( - Any, - Callable, - List, - Mapping, - Optional, - Tuple, - Type, - TypeVar, - Union, - cast, -) +from typing import Any, Callable, List, Mapping, Optional, Tuple, Type, Union, cast import warnings from pandas._libs.properties import cache_readonly # noqa - -FuncType = Callable[..., Any] -F = TypeVar("F", bound=FuncType) +from pandas._typing import F def deprecate( @@ -29,7 +16,7 @@ def deprecate( klass: Optional[Type[Warning]] = None, stacklevel: int = 2, msg: Optional[str] = None, -) -> Callable[..., Any]: +) -> Callable[[F], F]: """ Return a new function that emits a deprecation warning on use. @@ -100,7 +87,7 @@ def deprecate_kwarg( new_arg_name: Optional[str], mapping: Optional[Union[Mapping[Any, Any], Callable[[Any], Any]]] = None, stacklevel: int = 2, -) -> Callable[..., Any]: +) -> Callable[[F], F]: """ Decorator to deprecate a keyword argument of a function. @@ -216,6 +203,105 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: return _deprecate_kwarg +def _format_argument_list(allow_args: Union[List[str], int]): + """ + Convert the allow_args argument (either string or integer) of + `deprecate_nonkeyword_arguments` function to a string describing + it to be inserted into warning message. + + Parameters + ---------- + allowed_args : list, tuple or int + The `allowed_args` argument for `deprecate_nonkeyword_arguments`, + but None value is not allowed. + + Returns + ------- + s : str + The substring describing the argument list in best way to be + inserted to the warning message. + + Examples + -------- + `format_argument_list(0)` -> '' + `format_argument_list(1)` -> 'except for the first argument' + `format_argument_list(2)` -> 'except for the first 2 arguments' + `format_argument_list([])` -> '' + `format_argument_list(['a'])` -> "except for the arguments 'a'" + `format_argument_list(['a', 'b'])` -> "except for the arguments 'a' and 'b'" + `format_argument_list(['a', 'b', 'c'])` -> + "except for the arguments 'a', 'b' and 'c'" + """ + if not allow_args: + return "" + elif allow_args == 1: + return " except for the first argument" + elif isinstance(allow_args, int): + return " except for the first {num_args} arguments".format(num_args=allow_args) + elif len(allow_args) == 1: + return " except for the argument '{arg}'".format(arg=allow_args[0]) + else: + last = allow_args[-1] + args = ", ".join(["'" + x + "'" for x in allow_args[:-1]]) + return " except for the arguments {args} and '{last}'".format( + args=args, last=last + ) + + +def deprecate_nonkeyword_arguments( + version: str, + allowed_args: Optional[Union[List[str], int]] = None, + stacklevel: int = 2, +) -> Callable: + """ + Decorator to deprecate a use of non-keyword arguments of a function. 
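
A minimal usage sketch (not part of the patch), mirroring the new tests in pandas/tests/util/test_deprecate_nonkeyword_arguments.py above:

```python
# Usage sketch for the new decorator; mirrors the tests added in
# pandas/tests/util/test_deprecate_nonkeyword_arguments.py (illustration only).
import warnings

from pandas.util._decorators import deprecate_nonkeyword_arguments


@deprecate_nonkeyword_arguments(version="1.1", allowed_args=["a", "b"])
def f(a, b=0, c=0, d=0):
    return a + b + c + d


assert f(1, 2, c=3) == 6  # only 'a' and 'b' are positional: no warning

with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    assert f(1, 2, 3) == 6  # third positional argument triggers the deprecation
    assert w[0].category is FutureWarning
```
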
+ + Parameters + ---------- + version : str + The version in which positional arguments will become + keyword-only. + + allowed_args : list or int, optional + In case of list, it must be the list of names of some + first arguments of the decorated functions that are + OK to be given as positional arguments. In case of an + integer, this is the number of positional arguments + that will stay positional. In case of None value, + defaults to list of all arguments not having the + default value. + + stacklevel : int, default=2 + The stack level for warnings.warn + """ + + def decorate(func): + if allowed_args is not None: + allow_args = allowed_args + else: + spec = inspect.getfullargspec(func) + allow_args = spec.args[: -len(spec.defaults)] + + @wraps(func) + def wrapper(*args, **kwargs): + arguments = _format_argument_list(allow_args) + if isinstance(allow_args, (list, tuple)): + num_allow_args = len(allow_args) + else: + num_allow_args = allow_args + if len(args) > num_allow_args: + msg = ( + "Starting with Pandas version {version} all arguments of {funcname}" + "{except_args} will be keyword-only" + ).format(version=version, funcname=func.__name__, except_args=arguments) + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + return func(*args, **kwargs) + + return wrapper + + return decorate + + def rewrite_axis_style_signature( name: str, extra_params: List[Tuple[str, Any]] ) -> Callable[..., Any]: @@ -250,9 +336,11 @@ def doc(*args: Union[str, Callable], **kwargs: str) -> Callable[[F], F]: A decorator take docstring templates, concatenate them and perform string substitution on it. - This decorator is robust even if func.__doc__ is None. This decorator will - add a variable "_docstr_template" to the wrapped function to save original - docstring template for potential usage. + This decorator will add a variable "_docstring_components" to the wrapped + function to keep track the original docstring template for potential usage. + If it should be consider as a template, it will be saved as a string. + Otherwise, it will be saved as callable, and later user __doc__ and dedent + to get docstring. 
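
For context, a minimal sketch (not part of the patch) of how `@doc` composes templates, modeled on the pandas/tests/util/test_doc.py change shown earlier in this diff:

```python
# Sketch of how @doc composes docstring templates, modeled on the
# pandas/tests/util/test_doc.py change earlier in this patch (illustration only).
from textwrap import dedent

from pandas.util._decorators import doc


@doc(method="cumsum", operation="sum")
def cumsum(values):
    """
    Apply {method} to compute the cumulative {operation}.
    """


@doc(
    cumsum,
    dedent(
        """
        Examples
        --------
        >>> cumavg([1, 2, 3])
        2
        """
    ),
    method="cumavg",
    operation="average",
)
def cumavg(values):
    pass


# cumavg.__doc__ is the cumsum template re-formatted with method="cumavg" and
# operation="average", followed by the dedented Examples section.
print(cumavg.__doc__)
```
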
Parameters ---------- @@ -268,17 +356,28 @@ def decorator(func: F) -> F: def wrapper(*args, **kwargs) -> Callable: return func(*args, **kwargs) - templates = [func.__doc__ if func.__doc__ else ""] + # collecting docstring and docstring templates + docstring_components: List[Union[str, Callable]] = [] + if func.__doc__: + docstring_components.append(dedent(func.__doc__)) + for arg in args: - if isinstance(arg, str): - templates.append(arg) - elif hasattr(arg, "_docstr_template"): - templates.append(arg._docstr_template) # type: ignore - elif arg.__doc__: - templates.append(arg.__doc__) - - wrapper._docstr_template = "".join(dedent(t) for t in templates) # type: ignore - wrapper.__doc__ = wrapper._docstr_template.format(**kwargs) # type: ignore + if hasattr(arg, "_docstring_components"): + docstring_components.extend(arg._docstring_components) # type: ignore + elif isinstance(arg, str) or arg.__doc__: + docstring_components.append(arg) + + # formatting templates and concatenating docstring + wrapper.__doc__ = "".join( + [ + arg.format(**kwargs) + if isinstance(arg, str) + else dedent(arg.__doc__ or "") + for arg in docstring_components + ] + ) + + wrapper._docstring_components = docstring_components # type: ignore return cast(F, wrapper) diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index 8fd4566d7763b..71965b8e7dd9d 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -126,7 +126,7 @@ def _insert_index(self, data): if col_nlevels > 1: col = data.columns._get_level_values(0) values = [ - data.columns._get_level_values(i).values for i in range(1, col_nlevels) + data.columns._get_level_values(i)._values for i in range(1, col_nlevels) ] col_df = pd.DataFrame(values) data.columns = col_df.columns diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index f9502cc22b0c6..72003eeddf5ee 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -5,8 +5,9 @@ import platform import struct import sys -from typing import List, Optional, Tuple, Union +from typing import Dict, Optional, Union +from pandas._typing import JSONSerializable from pandas.compat._optional import VERSIONS, _get_version, import_optional_dependency @@ -21,43 +22,32 @@ def _get_commit_hash() -> Optional[str]: return versions["full-revisionid"] -def get_sys_info() -> List[Tuple[str, Optional[Union[str, int]]]]: +def _get_sys_info() -> Dict[str, JSONSerializable]: """ - Returns system information as a list + Returns system information as a JSON serializable dictionary. + """ + uname_result = platform.uname() + language_code, encoding = locale.getlocale() + return { + "commit": _get_commit_hash(), + "python": ".".join(str(i) for i in sys.version_info), + "python-bits": struct.calcsize("P") * 8, + "OS": uname_result.system, + "OS-release": uname_result.release, + "Version": uname_result.version, + "machine": uname_result.machine, + "processor": uname_result.processor, + "byteorder": sys.byteorder, + "LC_ALL": os.environ.get("LC_ALL"), + "LANG": os.environ.get("LANG"), + "LOCALE": {"language-code": language_code, "encoding": encoding}, + } + + +def _get_dependency_info() -> Dict[str, JSONSerializable]: + """ + Returns dependency information as a JSON serializable dictionary. 
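
Taken together with the reworked `show_versions` that follows, the `_print_versions` refactor can be exercised as below (an illustrative sketch, not part of the patch):

```python
# Illustrative calls into the refactored version reporting (not part of the patch).
import pandas as pd

pd.show_versions()                         # human-readable listing on the console
pd.show_versions(as_json=True)             # prints a {"system": ..., "dependencies": ...} dict
pd.show_versions(as_json="versions.json")  # writes the same payload to a JSON file
```
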
""" - blob: List[Tuple[str, Optional[Union[str, int]]]] = [] - - # get full commit hash - commit = _get_commit_hash() - - blob.append(("commit", commit)) - - try: - (sysname, nodename, release, version, machine, processor) = platform.uname() - blob.extend( - [ - ("python", ".".join(map(str, sys.version_info))), - ("python-bits", struct.calcsize("P") * 8), - ("OS", f"{sysname}"), - ("OS-release", f"{release}"), - # FIXME: dont leave commented-out - # ("Version", f"{version}"), - ("machine", f"{machine}"), - ("processor", f"{processor}"), - ("byteorder", f"{sys.byteorder}"), - ("LC_ALL", f"{os.environ.get('LC_ALL', 'None')}"), - ("LANG", f"{os.environ.get('LANG', 'None')}"), - ("LOCALE", ".".join(map(str, locale.getlocale()))), - ] - ) - except (KeyError, ValueError): - pass - - return blob - - -def show_versions(as_json=False): - sys_info = get_sys_info() deps = [ "pandas", # required @@ -86,39 +76,59 @@ def show_versions(as_json=False): "IPython", "pandas_datareader", ] - deps.extend(list(VERSIONS)) - deps_blob = [] + result: Dict[str, JSONSerializable] = {} for modname in deps: mod = import_optional_dependency( modname, raise_on_missing=False, on_version="ignore" ) - ver: Optional[str] - if mod: - ver = _get_version(mod) - else: - ver = None - deps_blob.append((modname, ver)) + result[modname] = _get_version(mod) if mod else None + return result + + +def show_versions(as_json: Union[str, bool] = False) -> None: + """ + Provide useful information, important for bug reports. + + It comprises info about hosting operation system, pandas version, + and versions of other installed relative packages. + + Parameters + ---------- + as_json : str or bool, default False + * If False, outputs info in a human readable form to the console. + * If str, it will be considered as a path to a file. + Info will be written to that file in JSON format. + * If True, outputs info in JSON format to the console. 
+ """ + sys_info = _get_sys_info() + deps = _get_dependency_info() if as_json: - j = dict(system=dict(sys_info), dependencies=dict(deps_blob)) + j = dict(system=sys_info, dependencies=deps) if as_json is True: print(j) else: + assert isinstance(as_json, str) # needed for mypy with codecs.open(as_json, "wb", encoding="utf8") as f: json.dump(j, f, indent=2) else: + assert isinstance(sys_info["LOCALE"], dict) # needed for mypy + language_code = sys_info["LOCALE"]["language-code"] + encoding = sys_info["LOCALE"]["encoding"] + sys_info["LOCALE"] = f"{language_code}.{encoding}" + maxlen = max(len(x) for x in deps) print("\nINSTALLED VERSIONS") print("------------------") - for k, stat in sys_info: - print(f"{k:<{maxlen}}: {stat}") + for k, v in sys_info.items(): + print(f"{k:<{maxlen}}: {v}") print("") - for k, stat in deps_blob: - print(f"{k:<{maxlen}}: {stat}") + for k, v in deps.items(): + print(f"{k:<{maxlen}}: {v}") def main() -> int: diff --git a/pyproject.toml b/pyproject.toml index 28d7c3d55c919..696785599d7da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "setuptools", "wheel", - "Cython>=0.29.13", # Note: sync with setup.py + "Cython>=0.29.16", # Note: sync with setup.py "numpy==1.13.3; python_version=='3.6' and platform_system!='AIX'", "numpy==1.14.5; python_version>='3.7' and platform_system!='AIX'", "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'", diff --git a/requirements-dev.txt b/requirements-dev.txt index a469cbdd93ceb..5cef428d35452 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,10 +2,10 @@ # See that file for comments about the need/usage of each dependency. numpy>=1.15 -python-dateutil>=2.6.1 +python-dateutil>=2.7.3 pytz asv -cython>=0.29.13 +cython>=0.29.16 black==19.10b0 cpplint flake8 @@ -55,7 +55,7 @@ numba>=0.46.0 beautifulsoup4>=4.6.0 html5lib lxml -openpyxl<=3.0.1 +openpyxl xlrd xlsxwriter xlwt @@ -68,7 +68,8 @@ tables>=3.4.2 s3fs sqlalchemy xarray +cftime pyreadstat tabulate>=0.8.3 -git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master +git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master git+https://github.com/numpy/numpydoc \ No newline at end of file diff --git a/scripts/tests/test_validate_unwanted_patterns.py b/scripts/tests/test_validate_unwanted_patterns.py new file mode 100644 index 0000000000000..b6cfa20cd7ca0 --- /dev/null +++ b/scripts/tests/test_validate_unwanted_patterns.py @@ -0,0 +1,419 @@ +import io + +import pytest + +import validate_unwanted_patterns + + +class TestBarePytestRaises: + @pytest.mark.parametrize( + "data", + [ + ( + """ + with pytest.raises(ValueError, match="foo"): + pass + """ + ), + ( + """ + # with pytest.raises(ValueError, match="foo"): + # pass + """ + ), + ( + """ + # with pytest.raises(ValueError): + # pass + """ + ), + ( + """ + with pytest.raises( + ValueError, + match="foo" + ): + pass + """ + ), + ], + ) + def test_pytest_raises(self, data): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) + assert result == [] + + @pytest.mark.parametrize( + "data, expected", + [ + ( + ( + """ + with pytest.raises(ValueError): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." 
+ ), + ), + ], + ), + ( + ( + """ + with pytest.raises(ValueError, match="foo"): + with pytest.raises(ValueError): + pass + pass + """ + ), + [ + ( + 2, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises(ValueError): + with pytest.raises(ValueError, match="foo"): + pass + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises( + ValueError + ): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises( + ValueError, + # match = "foo" + ): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ], + ) + def test_pytest_raises_raises(self, data, expected): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) + assert result == expected + + +@pytest.mark.parametrize( + "data, expected", + [ + ( + 'msg = ("bar " "baz")', + [ + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ) + ], + ), + ( + 'msg = ("foo " "bar " "baz")', + [ + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ), + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ), + ], + ), + ], +) +def test_strings_to_concatenate(data, expected): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.strings_to_concatenate(fd)) + assert result == expected + + +class TestStringsWithWrongPlacedWhitespace: + @pytest.mark.parametrize( + "data", + [ + ( + """ + msg = ( + "foo\n" + " bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + " bar" + "baz" + ) + """ + ), + ( + """ + msg = ( + f"foo" + " bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + f" bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + rf" bar" + ) + """ + ), + ], + ) + def test_strings_with_wrong_placed_whitespace(self, data): + fd = io.StringIO(data.strip()) + result = list( + validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd) + ) + assert result == [] + + @pytest.mark.parametrize( + "data, expected", + [ + ( + ( + """ + msg = ( + "foo" + " bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + f"foo" + " bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + "foo" + f" bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + f"foo" + f" bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + "foo" + rf" bar" + " baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." 
+ ), + ), + ], + ), + ( + ( + """ + msg = ( + "foo" + " bar" + rf" baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ], + ), + ( + ( + """ + msg = ( + "foo" + rf" bar" + rf" baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ], + ), + ], + ) + def test_strings_with_wrong_placed_whitespace_raises(self, data, expected): + fd = io.StringIO(data.strip()) + result = list( + validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd) + ) + assert result == expected diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py new file mode 100755 index 0000000000000..907db4ab4c7ce --- /dev/null +++ b/scripts/validate_rst_title_capitalization.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python +""" +Validate that the titles in the rst files follow the proper capitalization convention. + +Print the titles that do not follow the convention. + +Usage:: +./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst +./scripts/validate_rst_title_capitalization.py doc/source/ + +""" +import argparse +import glob +import os +import re +import sys +from typing import Iterable, List, Tuple + +CAPITALIZATION_EXCEPTIONS = { + "pandas", + "Python", + "IPython", + "PyTables", + "Excel", + "JSON", + "HTML", + "SAS", + "SQL", + "BigQuery", + "STATA", + "Interval", + "PEP8", + "Period", + "Series", + "Index", + "DataFrame", + "C", + "Git", + "GitHub", + "NumPy", + "Apache", + "Arrow", + "Parquet", + "MultiIndex", + "NumFOCUS", + "sklearn", + "Docker", + "PeriodIndex", + "NA", + "NaN", + "ValueError", + "BooleanArray", + "KeyError", + "API", + "FAQ", + "IO", + "TimedeltaIndex", + "DatetimeIndex", + "IntervalIndex", + "CategoricalIndex", + "GroupBy", + "SPSS", + "ORC", + "R", + "HDF5", + "HDFStore", + "CDay", + "CBMonthBegin", + "CBMonthEnd", + "BMonthBegin", + "BMonthEnd", + "BDay", + "FY5253Quarter", + "FY5253", + "YearBegin", + "YearEnd", + "BYearBegin", + "BYearEnd", + "YearOffset", + "QuarterBegin", + "QuarterEnd", + "BQuarterBegin", + "BQuarterEnd", + "QuarterOffset", + "LastWeekOfMonth", + "WeekOfMonth", + "SemiMonthBegin", + "SemiMonthEnd", + "SemiMonthOffset", + "CustomBusinessMonthBegin", + "CustomBusinessMonthEnd", + "BusinessMonthBegin", + "BusinessMonthEnd", + "MonthBegin", + "MonthEnd", + "MonthOffset", + "CustomBusinessHour", + "CustomBusinessDay", + "BusinessHour", + "BusinessDay", + "DateOffset", + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + "Float64Index", +} + +CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS} + +err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize" + +symbols = ("*", "=", "-", "^", "~", "#", '"') + + +def correct_title_capitalization(title: str) -> str: + """ + Algorithm to create the correct capitalization for a given title. + + Parameters + ---------- + title : str + Heading string to correct. + + Returns + ------- + str + Correctly capitalized heading. 
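
A sketch of the expected behaviour of this function (illustration only, not part of the patch; assumes the scripts/ directory is on `sys.path` so the module can be imported):

```python
# Expected behaviour sketch; assumes the scripts/ directory is on sys.path so
# the module can be imported (illustration only, not part of the patch).
from validate_rst_title_capitalization import correct_title_capitalization

# Sentence-case the heading, then restore words from CAPITALIZATION_EXCEPTIONS.
assert (
    correct_title_capitalization("getting started with pandas and NumPy")
    == "Getting started with pandas and NumPy"
)
assert correct_title_capitalization("Working With Excel Files") == "Working with Excel files"

# Headings starting with ":" (link syntax) are returned unchanged.
assert correct_title_capitalization(":ref:`whatsnew`") == ":ref:`whatsnew`"
```
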
+ """ + + # Skip modification no matter what if title begins by ":" to exclude specific + # syntax that is needed to build links. + if title[0] == ":": + return title + + # Strip all non-word characters from the beginning of the title to the + # first word character. + correct_title: str = re.sub(r"^\W*", "", title).capitalize() + + # Remove a URL from the title. We do this because words in a URL must + # stay lowercase, even if they are a capitalization exception. + removed_https_title = re.sub(r"", "", correct_title) + + # Split a title into a list using non-word character delimiters. + word_list = re.split(r"\W", removed_https_title) + + for word in word_list: + if word.lower() in CAP_EXCEPTIONS_DICT: + correct_title = re.sub( + rf"\b{word}\b", CAP_EXCEPTIONS_DICT[word.lower()], correct_title + ) + + return correct_title + + +def find_titles(rst_file: str) -> Iterable[Tuple[str, int]]: + """ + Algorithm to identify particular text that should be considered headings in an + RST file. + + See for details + on what constitutes a string as a heading in RST. + + Parameters + ---------- + rst_file : str + RST file to scan through for headings. + + Yields + ------- + title : str + A heading found in the rst file. + + line_number : int + The corresponding line number of the heading. + """ + + with open(rst_file, "r") as fd: + previous_line = "" + for i, line in enumerate(fd): + line = line[:-1] + line_chars = set(line) + if ( + len(line_chars) == 1 + and line_chars.pop() in symbols + and len(line) == len(previous_line) + ): + yield re.sub(r"[`\*_]", "", previous_line), i + previous_line = line + + +def find_rst_files(source_paths: List[str]) -> Iterable[str]: + """ + Given the command line arguments of directory paths, this method + yields the strings of the .rst file directories that these paths contain. + + Parameters + ---------- + source_paths : str + List of directories to validate, provided through command line arguments. + + Yields + ------- + str + Directory address of a .rst files found in command line argument directories. + """ + + for directory_address in source_paths: + if not os.path.exists(directory_address): + raise ValueError( + "Please enter a valid path, pointing to a valid file/directory." + ) + elif directory_address.endswith(".rst"): + yield directory_address + else: + for filename in glob.glob( + pathname=f"{directory_address}/**/*.rst", recursive=True + ): + yield filename + + +def main(source_paths: List[str], output_format: str) -> int: + """ + The main method to print all headings with incorrect capitalization. + + Parameters + ---------- + source_paths : str + List of directories to validate, provided through command line arguments. + output_format : str + Output format of the script. + + Returns + ------- + int + Number of incorrect headings found overall. + """ + + number_of_errors: int = 0 + + for filename in find_rst_files(source_paths): + for title, line_number in find_titles(filename): + if title != correct_title_capitalization(title): + print( + f"""{filename}:{line_number}:{err_msg} "{title}" to "{ + correct_title_capitalization(title)}" """ + ) + number_of_errors += 1 + + return number_of_errors + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Validate heading capitalization") + + parser.add_argument( + "paths", nargs="+", default=".", help="Source paths of file/directory to check." 
+ ) + + parser.add_argument( + "--format", + "-f", + default="{source_path}:{line_number}:{msg}:{heading}:{correct_heading}", + help="Output format of incorrectly capitalized titles", + ) + + args = parser.parse_args() + + sys.exit(main(args.paths, args.format)) diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py deleted file mode 100755 index fbf3bb5cfccf2..0000000000000 --- a/scripts/validate_string_concatenation.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python3 -""" -GH #30454 - -Check where there is a string that needs to be concatenated. - -This is necessary after black formating, -where for example black transforms this: - ->>> foo = ( -... "bar " -... "baz" -... ) - -into this: - ->>> foo = ("bar " "baz") - -Black is not considering this as an -issue (see issue https://github.com/psf/black/issues/1051), -so we are checking it here. -""" - -import argparse -import os -import sys -import token -import tokenize -from typing import Generator, List, Tuple - -FILE_EXTENSIONS_TO_CHECK = (".py", ".pyx", ".pyx.ini", ".pxd") - - -def main(source_path: str, output_format: str) -> bool: - """ - Main entry point of the script. - - Parameters - ---------- - source_path : str - Source path representing path to a file/directory. - output_format : str - Output format of the script. - - Returns - ------- - bool - True if found any strings that needs to be concatenated. - - Raises - ------ - ValueError - If the `source_path` is not pointing to existing file/directory. - """ - if not os.path.exists(source_path): - raise ValueError( - "Please enter a valid path, pointing to a valid file/directory." - ) - - is_failed: bool = False - - msg = "String unnecessarily split in two by black. Please merge them manually." - - if os.path.isfile(source_path): - for source_path, line_number in strings_to_concatenate(source_path): - is_failed = True - print( - output_format.format( - source_path=source_path, line_number=line_number, msg=msg - ) - ) - - for subdir, _, files in os.walk(source_path): - for file_name in files: - if any( - file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK - ): - for source_path, line_number in strings_to_concatenate( - os.path.join(subdir, file_name) - ): - is_failed = True - print( - output_format.format( - source_path=source_path, line_number=line_number, msg=msg - ) - ) - return is_failed - - -def strings_to_concatenate(source_path: str) -> Generator[Tuple[str, int], None, None]: - """ - Yielding the strings that needs to be concatenated in a given file. - - Parameters - ---------- - source_path : str - File path pointing to a single file. - - Yields - ------ - source_path : str - Source file path. - line_number : int - Line number of unconcatenated string. - """ - with open(source_path, "r") as file_name: - tokens: List = list(tokenize.generate_tokens(file_name.readline)) - - for current_token, next_token in zip(tokens, tokens[1:]): - if current_token[0] == next_token[0] == token.STRING: - yield source_path, current_token[2][0] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Validate concatenated strings") - - parser.add_argument( - "path", nargs="?", default=".", help="Source path of file/directory to check." 
- ) - parser.add_argument( - "--format", - "-f", - default="{source_path}:{line_number}:{msg}", - help="Output format of the unconcatenated strings.", - ) - - args = parser.parse_args() - - sys.exit(main(source_path=args.path, output_format=args.format)) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py new file mode 100755 index 0000000000000..193fef026a96b --- /dev/null +++ b/scripts/validate_unwanted_patterns.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 +""" +Unwanted patterns test cases. + +The reason this file exist despite the fact we already have +`ci/code_checks.sh`, +(see https://github.com/pandas-dev/pandas/blob/master/ci/code_checks.sh) + +is that some of the test cases are more complex/imposible to validate via regex. +So this file is somewhat an extensions to `ci/code_checks.sh` +""" + +import argparse +import ast +import os +import sys +import token +import tokenize +from typing import IO, Callable, FrozenSet, Iterable, List, Tuple + +PATHS_TO_IGNORE: Tuple[str, ...] = ("asv_bench/env",) + + +def _get_literal_string_prefix_len(token_string: str) -> int: + """ + Getting the length of the literal string prefix. + + Parameters + ---------- + token_string : str + String to check. + + Returns + ------- + int + Length of the literal string prefix. + + Examples + -------- + >>> example_string = "'Hello world'" + >>> _get_literal_string_prefix_len(example_string) + 0 + >>> example_string = "r'Hello world'" + >>> _get_literal_string_prefix_len(example_string) + 1 + """ + try: + return min( + token_string.find(quote) + for quote in (r"'", r'"') + if token_string.find(quote) >= 0 + ) + except ValueError: + return 0 + + +def bare_pytest_raises(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + Test Case for bare pytest raises. + + For example, this is wrong: + + >>> with pytest.raise(ValueError): + ... # Some code that raises ValueError + + And this is what we want instead: + + >>> with pytest.raise(ValueError, match="foo"): + ... # Some code that raises ValueError + + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explenation of the error. + + Notes + ----- + GH #23922 + """ + contents = file_obj.read() + tree = ast.parse(contents) + + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + + try: + if not (node.func.value.id == "pytest" and node.func.attr == "raises"): + continue + except AttributeError: + continue + + if not node.keywords: + yield ( + node.lineno, + "Bare pytests raise have been found. " + "Please pass in the argument 'match' as well the exception.", + ) + else: + # Means that there are arguments that are being passed in, + # now we validate that `match` is one of the passed in arguments + if not any(keyword.arg == "match" for keyword in node.keywords): + yield ( + node.lineno, + "Bare pytests raise have been found. " + "Please pass in the argument 'match' as well the exception.", + ) + + +def strings_to_concatenate(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + This test case is necessary after 'Black' (https://github.com/psf/black), + is formating strings over multiple lines. + + For example, when this: + + >>> foo = ( + ... "bar " + ... "baz" + ... 
) + + Is becoming this: + + >>> foo = ("bar " "baz") + + 'Black' is not considering this as an + issue (see https://github.com/psf/black/issues/1051), + so we are checking it here instead. + + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explenation of the error. + + Notes + ----- + GH #30454 + """ + tokens: List = list(tokenize.generate_tokens(file_obj.readline)) + + for current_token, next_token in zip(tokens, tokens[1:]): + if current_token.type == next_token.type == token.STRING: + yield ( + current_token.start[0], + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ) + + +def strings_with_wrong_placed_whitespace( + file_obj: IO[str], +) -> Iterable[Tuple[int, str]]: + """ + Test case for leading spaces in concated strings. + + For example: + + >>> rule = ( + ... "We want the space at the end of the line, " + ... "not at the beginning" + ... ) + + Instead of: + + >>> rule = ( + ... "We want the space at the end of the line," + ... " not at the beginning" + ... ) + + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explenation of the error. + """ + + def has_wrong_whitespace(first_line: str, second_line: str) -> bool: + """ + Checking if the two lines are mattching the unwanted pattern. + + Parameters + ---------- + first_line : str + First line to check. + second_line : str + Second line to check. + + Returns + ------- + bool + True if the two recived string match, an unwanted pattern. + + Notes + ----- + The unwanted pattern that we are trying to catch is if the spaces in + a string that is concatenated over multiple lines are placed at the + end of each string, unless this string is ending with a + newline character (\n). + + For example, this is bad: + + >>> rule = ( + ... "We want the space at the end of the line," + ... " not at the beginning" + ... ) + + And what we want is: + + >>> rule = ( + ... "We want the space at the end of the line, " + ... "not at the beginning" + ... ) + + And if the string is ending with a new line character (\n) we + do not want any trailing whitespaces after it. + + For example, this is bad: + + >>> rule = ( + ... "We want the space at the begging of " + ... "the line if the previous line is ending with a \n " + ... "not at the end, like always" + ... ) + + And what we do want is: + + >>> rule = ( + ... "We want the space at the begging of " + ... "the line if the previous line is ending with a \n" + ... " not at the end, like always" + ... 
) + """ + if first_line.endswith(r"\n"): + return False + elif first_line.startswith(" ") or second_line.startswith(" "): + return False + elif first_line.endswith(" ") or second_line.endswith(" "): + return False + elif (not first_line.endswith(" ")) and second_line.startswith(" "): + return True + return False + + tokens: List = list(tokenize.generate_tokens(file_obj.readline)) + + for first_token, second_token, third_token in zip(tokens, tokens[1:], tokens[2:]): + # Checking if we are in a block of concated string + if ( + first_token.type == third_token.type == token.STRING + and second_token.type == token.NL + ): + # Striping the quotes, with the string litteral prefix + first_string: str = first_token.string[ + _get_literal_string_prefix_len(first_token.string) + 1 : -1 + ] + second_string: str = third_token.string[ + _get_literal_string_prefix_len(third_token.string) + 1 : -1 + ] + + if has_wrong_whitespace(first_string, second_string): + yield ( + third_token.start[0], + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + + +def main( + function: Callable[[IO[str]], Iterable[Tuple[int, str]]], + source_path: str, + output_format: str, + file_extensions_to_check: str, +) -> bool: + """ + Main entry point of the script. + + Parameters + ---------- + function : Callable + Function to execute for the specified validation type. + source_path : str + Source path representing path to a file/directory. + output_format : str + Output format of the error message. + + Returns + ------- + bool + True if found any patterns are found related to the given function. + + Raises + ------ + ValueError + If the `source_path` is not pointing to existing file/directory. + """ + if not os.path.exists(source_path): + raise ValueError("Please enter a valid path, pointing to a file/directory.") + + is_failed: bool = False + file_path: str = "" + + FILE_EXTENSIONS_TO_CHECK: FrozenSet[str] = frozenset( + file_extensions_to_check.split(",") + ) + + if os.path.isfile(source_path): + file_path = source_path + with open(file_path, "r") as file_obj: + for line_number, msg in function(file_obj): + is_failed = True + print( + output_format.format( + source_path=file_path, line_number=line_number, msg=msg + ) + ) + + for subdir, _, files in os.walk(source_path): + if any(path in subdir for path in PATHS_TO_IGNORE): + continue + for file_name in files: + if not any( + file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK + ): + continue + + file_path = os.path.join(subdir, file_name) + with open(file_path, "r") as file_obj: + for line_number, msg in function(file_obj): + is_failed = True + print( + output_format.format( + source_path=file_path, line_number=line_number, msg=msg + ) + ) + + return is_failed + + +if __name__ == "__main__": + available_validation_types: List[str] = [ + "bare_pytest_raises", + "strings_to_concatenate", + "strings_with_wrong_placed_whitespace", + ] + + parser = argparse.ArgumentParser(description="Unwanted patterns checker.") + + parser.add_argument( + "path", nargs="?", default=".", help="Source path of file/directory to check." 
+ ) + parser.add_argument( + "--format", + "-f", + default="{source_path}:{line_number}:{msg}", + help="Output format of the error message.", + ) + parser.add_argument( + "--validation-type", + "-vt", + choices=available_validation_types, + required=True, + help="Validation test case to check.", + ) + parser.add_argument( + "--included-file-extensions", + default="py,pyx,pxd,pxi", + help="Coma seperated file extensions to check.", + ) + + args = parser.parse_args() + + sys.exit( + main( + function=globals().get(args.validation_type), # type: ignore + source_path=args.path, + output_format=args.format, + file_extensions_to_check=args.included_file_extensions, + ) + ) diff --git a/setup.cfg b/setup.cfg index 61d5b1030a500..6c42b27c7b015 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,7 +60,7 @@ markers = db: tests requiring a database (mysql or postgres) high_memory: mark a test as a high-memory only clipboard: mark a pd.read_clipboard test -doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL +doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS addopts = --strict-data-files xfail_strict = True filterwarnings = @@ -98,6 +98,7 @@ exclude_lines = # Don't complain if non-runnable code isn't run: if 0: if __name__ == .__main__.: + if TYPE_CHECKING: [coverage:html] directory = coverage_html_report @@ -125,6 +126,8 @@ ignore_missing_imports=True no_implicit_optional=True check_untyped_defs=True strict_equality=True +warn_redundant_casts = True +warn_unused_ignores = True [mypy-pandas.tests.*] check_untyped_defs=False @@ -135,7 +138,7 @@ ignore_errors=True [mypy-pandas.tests.arithmetic.test_datetime64] ignore_errors=True -[mypy-pandas.tests.indexes.datetimes.test_tools] +[mypy-pandas.tests.tools.test_to_datetime] ignore_errors=True [mypy-pandas.tests.scalar.period.test_period] @@ -195,9 +198,6 @@ check_untyped_defs=False [mypy-pandas.core.indexes.multi] check_untyped_defs=False -[mypy-pandas.core.indexing] -check_untyped_defs=False - [mypy-pandas.core.internals.blocks] check_untyped_defs=False diff --git a/setup.py b/setup.py index 2d49d7e1e85f2..a2e01e08e8de2 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def is_platform_mac(): min_numpy_ver = "1.13.3" -min_cython_ver = "0.29.13" # note: sync with pyproject.toml +min_cython_ver = "0.29.16" # note: sync with pyproject.toml try: import Cython @@ -433,8 +433,7 @@ def run(self): extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - # args to ignore warnings - extra_compile_args = [] + extra_compile_args = ["-Werror"] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") @@ -455,6 +454,9 @@ def run(self): ): os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9" + if sys.version_info[:2] == (3, 8): # GH 33239 + extra_compile_args.append("-Wno-error=deprecated-declarations") + # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled @@ -477,6 +479,14 @@ def run(self): # we can't do anything about these warnings because they stem from # cython+numpy version mismatches. 
macros.append(("NPY_NO_DEPRECATED_API", "0")) +if "-Werror" in extra_compile_args: + try: + import numpy as np + except ImportError: + pass + else: + if np.__version__ < LooseVersion("1.16.0"): + extra_compile_args.remove("-Werror") # ---------------------------------------------------------------------- @@ -737,7 +747,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): def setup_package(): setuptools_kwargs = { "install_requires": [ - "python-dateutil >= 2.6.1", + "python-dateutil >= 2.7.3", "pytz >= 2017.2", f"numpy >= {min_numpy_ver}", ], @@ -750,7 +760,7 @@ def setup_package(): maintainer=AUTHOR, version=versioneer.get_version(), packages=find_packages(include=["pandas", "pandas.*"]), - package_data={"": ["templates/*", "_libs/*.dll"]}, + package_data={"": ["templates/*", "_libs/**/*.dll"]}, ext_modules=maybe_cythonize(extensions, compiler_directives=directives), maintainer_email=EMAIL, description=DESCRIPTION, diff --git a/web/pandas/about/citing.md b/web/pandas/about/citing.md index d5cb64e58f0ad..25d2c86061daa 100644 --- a/web/pandas/about/citing.md +++ b/web/pandas/about/citing.md @@ -2,31 +2,35 @@ ## Citing pandas -If you use _pandas_ for a scientific publication, we would appreciate citations to one of the following papers: +If you use _pandas_ for a scientific publication, we would appreciate citations to the published software and the +following paper: + +- [pandas on Zenodo](https://zenodo.org/record/3715232#.XoqFyC2ZOL8), + Please find us on Zenodo and replace with the citation for the version you are using. You cna replace the full author + list from there with "The pandas development team" like in the example below. + + @software{reback2020pandas, + author = {The pandas development team}, + title = {pandas-dev/pandas: Pandas}, + month = feb, + year = 2020, + publisher = {Zenodo}, + version = {latest}, + doi = {10.5281/zenodo.3509134}, + url = {https://doi.org/10.5281/zenodo.3509134} + } - [Data structures for statistical computing in python](https://conference.scipy.org/proceedings/scipy2010/pdfs/mckinney.pdf), McKinney, Proceedings of the 9th Python in Science Conference, Volume 445, 2010. - @inproceedings{mckinney2010data, - title={Data structures for statistical computing in python}, - author={Wes McKinney}, - booktitle={Proceedings of the 9th Python in Science Conference}, - volume={445}, - pages={51--56}, - year={2010}, - organization={Austin, TX} - } - - -- [pandas: a foundational Python library for data analysis and statistics](https://www.scribd.com/document/71048089/pandas-a-Foundational-Python-Library-for-Data-Analysis-and-Statistics), - McKinney, Python for High Performance and Scientific Computing, Volume 14, 2011. 
- - @article{mckinney2011pandas, - title={pandas: a foundational Python library for data analysis and statistics}, - author={Wes McKinney}, - journal={Python for High Performance and Scientific Computing}, - volume={14}, - year={2011} + @InProceedings{ mckinney-proc-scipy-2010, + author = { {W}es {M}c{K}inney }, + title = { {D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython }, + booktitle = { {P}roceedings of the 9th {P}ython in {S}cience {C}onference }, + pages = { 56 - 61 }, + year = { 2010 }, + editor = { {S}t\'efan van der {W}alt and {J}arrod {M}illman }, + doi = { 10.25080/Majora-92bf1922-00a } } ## Brand and logo diff --git a/web/pandas/community/blog/2019-user-survey.md b/web/pandas/community/blog/2019-user-survey.md new file mode 100644 index 0000000000000..73c426e7cbec9 --- /dev/null +++ b/web/pandas/community/blog/2019-user-survey.md @@ -0,0 +1,172 @@ +Title: 2019 pandas user survey +Date: 2019-08-22 + + + +# 2019 pandas user survey + +Pandas recently conducted a user survey to help guide future development. +Thanks to everyone who participated! This post presents the high-level results. + +This analysis and the raw data can be found [on GitHub](https://github.com/pandas-dev/pandas-user-surveys) and run on Binder + +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/pandas-dev/pandas-user-surveys/master?filepath=2019.ipynb) + + +We had about 1250 repsonses over the 15 days we ran the survey in the summer of 2019. + +## About the Respondents + +There was a fair amount of representation across pandas experience and frequeny of use, though the majority of respondents are on the more experienced side. + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_4_0.png) + + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_5_0.png) + + +We included a few questions that were also asked in the [Python Developers Survey](https://www.jetbrains.com/research/python-developers-survey-2018/) so we could compare Pandas' population to Python's. + +90% of our respondents use Python as a primary language (compared with 84% from the PSF survey). + + + + + + Yes 90.67% + No 9.33% + Name: Is Python your main language?, dtype: object + + + +Windows users are well represented (see [Steve Dower's talk](https://www.youtube.com/watch?v=uoI57uMdDD4) on this topic). + + + + + + Linux 61.57% + Windows 60.21% + MacOS 42.75% + Name: What Operating Systems do you use?, dtype: object + + + +For environment isolation, [conda](https://conda.io/en/latest/) was the most popular. + + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_13_0.png) + + +Most repondents are Python 3 only. + + + + + + 3 92.39% + 2 & 3 6.80% + 2 0.81% + Name: Python 2 or 3?, dtype: object + + + +## Pandas APIs + +It can be hard for open source projects to know what features are actually used. We asked a few questions to get an idea. + +CSV and Excel are (for better or worse) the most popular formats. + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_18_0.png) + + +In preperation for a possible refactor of pandas internals, we wanted to get a sense for +how common wide (100s of columns or more) DataFrames are. + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_20_0.png) + + +Pandas is slowly growing new exentension types. Categoricals are the most popular, +and the nullable integer type is already almost as popular as datetime with timezone. 
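If you have not used these extension types before, here is a minimal illustrative sketch (not part of the original survey post; dtype spellings follow the pandas 0.24+ API, and exact output may vary by version):

```python
import pandas as pd

# Categorical: values drawn from a fixed set of categories
cat = pd.Series(["a", "b", "a"], dtype="category")

# Nullable integer: integers that can hold missing values without
# being upcast to float
ints = pd.Series([1, None, 3], dtype="Int64")

# Datetime with timezone
stamps = pd.Series(pd.date_range("2019-01-01", periods=3, tz="UTC"))

print(cat.dtype, ints.dtype, stamps.dtype)
# category Int64 datetime64[ns, UTC]
```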
+ + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_22_0.png) + + +More and better examples seem to be a high-priority development item. +Pandas recently received a NumFOCUS grant to improve our documentation, +which we're using to write tutorial-style documentation, which should help +meet this need. + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_24_0.png) + + +We also asked about specific, commonly-requested features. + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_26_0.png) + + +Of these, the clear standout is "scaling" to large datasets. A couple observations: + +1. Perhaps pandas' documentation should do a better job of promoting libraries that provide scalable dataframes (like [Dask](https://dask.org), [vaex](https://dask.org), and [modin](https://modin.readthedocs.io/en/latest/)) +2. Memory efficiency (perhaps from a native string data type, fewer internal copies, etc.) is a valuable goal. + +After that, the next-most critical improvement is integer missing values. Those were actually added in [Pandas 0.24](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#optional-integer-na-support), but they're not the default, and there's still some incompatibilites with the rest of pandas API. + +Pandas is a less conservative library than, say, NumPy. We're approaching 1.0, but on the way we've made many deprecations and some outright API breaking changes. Fortunately, most people are OK with the tradeoff. + + + + + + Yes 94.89% + No 5.11% + Name: Is Pandas stable enough for you?, dtype: object + + + +There's a perception (which is shared by many of the pandas maintainers) that the pandas API is too large. To measure that, we asked whether users thought that pandas' API was too large, too small, or just right. + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_31_0.png) + + +Finally, we asked for an overall satisfaction with the library, from 1 (not very unsatisfied) to 5 (very satisfied). + + + +![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_33_0.png) + + +Most people are very satisfied. The average response is 4.39. I look forward to tracking this number over time. + +If you're analyzing the raw data, be sure to share the results with us [@pandas_dev](https://twitter.com/pandas_dev). diff --git a/web/pandas/community/blog/extension-arrays.md b/web/pandas/community/blog/extension-arrays.md new file mode 100644 index 0000000000000..bc6179adfa719 --- /dev/null +++ b/web/pandas/community/blog/extension-arrays.md @@ -0,0 +1,218 @@ +Title: pandas extension arrays +Date: 2019-01-04 + +# pandas extension arrays + +Extensibility was a major theme in pandas development over the last couple of +releases. This post introduces the pandas extension array interface: the +motivation behind it and how it might affect you as a pandas user. Finally, we +look at how extension arrays may shape the future of pandas. + +Extension Arrays are just one of the changes in pandas 0.24.0. See the +[whatsnew][whatsnew] for a full changelog. + +## The Motivation + +Pandas is built on top of NumPy. You could roughly define a Series as a wrapper +around a NumPy array, and a DataFrame as a collection of Series with a shared +index. That's not entirely correct for several reasons, but I want to focus on +the "wrapper around a NumPy array" part. It'd be more correct to say "wrapper +around an array-like object". + +Pandas mostly uses NumPy's builtin data representation; we've restricted it in +places and extended it in others. 
For example, pandas' early users cared greatly +about timezone-aware datetimes, which NumPy doesn't support. So pandas +internally defined a `DatetimeTZ` dtype (which mimics a NumPy dtype), and +allowed you to use that dtype in `Index`, `Series`, and as a column in a +`DataFrame`. That dtype carried around the tzinfo, but wasn't itself a valid +NumPy dtype. + +As another example, consider `Categorical`. This actually composes *two* arrays: +one for the `categories` and one for the `codes`. But it can be stored in a +`DataFrame` like any other column. + +Each of these extension types pandas added is useful on its own, but carries a +high maintenance cost. Large sections of the codebase need to be aware of how to +handle a NumPy array or one of these other kinds of special arrays. This made +adding new extension types to pandas very difficult. + +Anaconda, Inc. had a client who regularly dealt with datasets with IP addresses. +They wondered if it made sense to add an [IPArray][IPArray] to pandas. In the +end, we didn't think it passed the cost-benefit test for inclusion in pandas +*itself*, but we were interested in defining an interface for third-party +extensions to pandas. Any object implementing this interface would be allowed in +pandas. I was able to write [cyberpandas][cyberpandas] outside of pandas, but it +feels like using any other dtype built into pandas. + +## The Current State + +As of pandas 0.24.0, all of pandas' internal extension arrays (Categorical, +Datetime with Timezone, Period, Interval, and Sparse) are now built on top of +the ExtensionArray interface. Users shouldn't notice many changes. The main +thing you'll notice is that things are cast to `object` dtype in fewer places, +meaning your code will run faster and your types will be more stable. This +includes storing `Period` and `Interval` data in `Series` (which were previously +cast to object dtype). + +Additionally, we'll be able to add *new* extension arrays with relative ease. +For example, 0.24.0 (optionally) solved one of pandas longest-standing pain +points: missing values casting integer-dtype values to float. + + +```python +>>> int_ser = pd.Series([1, 2], index=[0, 2]) +>>> int_ser +0 1 +2 2 +dtype: int64 + +>>> int_ser.reindex([0, 1, 2]) +0 1.0 +1 NaN +2 2.0 +dtype: float64 +``` + +With the new [IntegerArray][IntegerArray] and nullable integer dtypes, we can +natively represent integer data with missing values. + +```python +>>> int_ser = pd.Series([1, 2], index=[0, 2], dtype=pd.Int64Dtype()) +>>> int_ser +0 1 +2 2 +dtype: Int64 + +>>> int_ser.reindex([0, 1, 2]) +0 1 +1 NaN +2 2 +dtype: Int64 +``` + +One thing it does slightly change how you should access the raw (unlabeled) +arrays stored inside a Series or Index, which is occasionally useful. Perhaps +the method you're calling only works with NumPy arrays, or perhaps you want to +disable automatic alignment. + +In the past, you'd hear things like "Use `.values` to extract the NumPy array +from a Series or DataFrame." If it were a good resource, they'd tell you that's +not *entirely* true, since there are some exceptions. I'd like to delve into +those exceptions. + +The fundamental problem with `.values` is that it serves two purposes: + +1. Extracting the array backing a Series, Index, or DataFrame +2. Converting the Series, Index, or DataFrame to a NumPy array + +As we saw above, the "array" backing a Series or Index might not be a NumPy +array, it may instead be an extension array (from pandas or a third-party +library). 
For example, consider `Categorical`, + +```python +>>> cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']) +>>> ser = pd.Series(cat) +>>> ser +0 a +1 b +2 a +dtype: category +Categories (3, object): [a, b, c] + +>>> ser.values +[a, b, a] +Categories (3, object): [a, b, c] +``` + +In this case `.values` is a Categorical, not a NumPy array. For period-dtype +data, `.values` returns a NumPy array of `Period` objects, which is expensive to +create. For timezone-aware data, `.values` converts to UTC and *drops* the +timezone info. These kind of surprises (different types, or expensive or lossy +conversions) stem from trying to shoehorn these extension arrays into a NumPy +array. But the entire point of an extension array is for representing data NumPy +*can't* natively represent. + +To solve the `.values` problem, we've split its roles into two dedicated methods: + +1. Use `.array` to get a zero-copy reference to the underlying data +2. Use `.to_numpy()` to get a (potentially expensive, lossy) NumPy array of the + data. + +So with our Categorical example, + +```python +>>> ser.array +[a, b, a] +Categories (3, object): [a, b, c] + +>>> ser.to_numpy() +array(['a', 'b', 'a'], dtype=object) +``` + +To summarize: + +- `.array` will *always* be a an ExtensionArray, and is always a zero-copy + reference back to the data. +- `.to_numpy()` is *always* a NumPy array, so you can reliably call + ndarray-specific methods on it. + +You shouldn't ever need `.values` anymore. + +## Possible Future Paths + +Extension Arrays open up quite a few exciting opportunities. Currently, pandas +represents string data using Python objects in a NumPy array, which is slow. +Libraries like [Apache Arrow][arrow] provide native support for variable-length +strings, and the [Fletcher][fletcher] library provides pandas extension arrays +for Arrow arrays. It will allow [GeoPandas][geopandas] to store geometry data +more efficiently. Pandas (or third-party libraries) will be able to support +nested data, data with units, geo data, GPU arrays. Keep an eye on the +[pandas ecosystem][eco] page, which will keep track of third-party extension +arrays. It's an exciting time for pandas development. + +## Other Thoughts + +I'd like to emphasize that this is an *interface*, and not a concrete array +implementation. We are *not* reimplementing NumPy here in pandas. Rather, this +is a way to take any array-like data structure (one or more NumPy arrays, an +Apache Arrow array, a CuPy array) and place it inside a DataFrame. I think +getting pandas out of the array business, and instead thinking about +higher-level tabular data things, is a healthy development for the project. + +This works perfectly with NumPy's [`__array_ufunc__`][ufunc] protocol and +[NEP-18][nep18]. You'll be able to use the familiar NumPy API on objects that +aren't backed by NumPy memory. + +## Upgrade + +These new goodies are all available in the recently released pandas 0.24. + +conda: + + conda install -c conda-forge pandas + +pip: + + pip install --upgrade pandas + +As always, we're happy to hear feedback on the [mailing list][ml], +[@pandas-dev][twitter], or [issue tracker][tracker]. + +Thanks to the many contributors, maintainers, and [institutional +partners][partners] involved in the pandas community. 
+
+
+[IPArray]: https://github.com/pandas-dev/pandas/issues/18767
+[cyberpandas]: https://cyberpandas.readthedocs.io
+[IntegerArray]: http://pandas.pydata.org/pandas-docs/version/0.24/reference/api/pandas.arrays.IntegerArray.html
+[fletcher]: https://github.com/xhochy/fletcher
+[arrow]: https://arrow.apache.org
+[ufunc]: https://docs.scipy.org/doc/numpy-1.13.0/neps/ufunc-overrides.html
+[nep18]: https://www.numpy.org/neps/nep-0018-array-function-protocol.html
+[ml]: https://mail.python.org/mailman/listinfo/pandas-dev
+[twitter]: https://twitter.com/pandas_dev
+[tracker]: https://github.com/pandas-dev/pandas/issues
+[partners]: https://github.com/pandas-dev/pandas-governance/blob/master/people.md
+[eco]: http://pandas.pydata.org/pandas-docs/stable/ecosystem.html#extension-data-types
+[whatsnew]: http://pandas.pydata.org/pandas-docs/version/0.24/whatsnew/v0.24.0.html
+[geopandas]: https://github.com/geopandas/geopandas
diff --git a/web/pandas/community/blog.html b/web/pandas/community/blog/index.html
similarity index 100%
rename from web/pandas/community/blog.html
rename to web/pandas/community/blog/index.html
diff --git a/web/pandas/community/blog/pandas-1.0.md b/web/pandas/community/blog/pandas-1.0.md
new file mode 100644
index 0000000000000..b07c34a4ab6b5
--- /dev/null
+++ b/web/pandas/community/blog/pandas-1.0.md
@@ -0,0 +1,31 @@
+Title: pandas 1.0
+Date: 2020-01-29
+
+# pandas 1.0
+
+Today pandas celebrates its 1.0.0 release. In many ways this is just a normal release with a host of new features, performance improvements, and bug fixes, which are documented in our [release notes](https://pandas.pydata.org/pandas-docs/version/1.0.0/whatsnew/v1.0.0.html). But it’s also something a bit more — a milestone for the project beyond just the commits. We wanted to take some time to reflect on where we've been and where we're going.
+
+## Reflections
+
+The world of scientific Python has changed a lot since pandas was started. In 2011, [the ecosystem was fragmented](https://wesmckinney.com/blog/a-roadmap-for-rich-scientific-data-structures-in-python/): a standard *rich* data structure for statistics and data science had yet to emerge. This echoes a similar story for NumPy, which consolidated array efforts that were [previously fragmented](https://numpy.org/old_array_packages.html).
+
+Over the subsequent years, pandas emerged as a *de facto* standard. It’s used by data scientists and analysts and as a data structure for other libraries to build on top of. StackOverflow [cited pandas](https://stackoverflow.blog/2017/09/14/python-growing-quickly/) as one of the reasons for Python being the fastest growing major programming language.
+
+![Growth of pandas](https://149351115.v2.pressablecdn.com/wp-content/uploads/2017/09/related_tags_over_time-1-1000x1000.png)
+
+Today, the ecosystem is in another phase of exploration.
+Several new DataFrame implementations are cropping up to fill needs not met by pandas.
+We're [working with those projects](https://datapythonista.me/blog/dataframe-summit-at-euroscipy.html) to establish shared standards and semantics for rich data structures.
+
+## Community and Project Health
+
+This release cycle is the first to involve any kind of grant funding for pandas. 
[Pandas received funding](https://chanzuckerberg.com/eoss/proposals/) as part of the CZI’s [*Essential Open Source Software for Science*](https://medium.com/@cziscience/the-invisible-foundations-of-biomedicine-4ab7f8d4f5dd) [program](https://medium.com/@cziscience/the-invisible-foundations-of-biomedicine-4ab7f8d4f5dd). The pandas project relies overwhelmingly on volunteer contributors. These volunteer contributions are shepherded and augmented by some maintainers who are given time from their employers — our [institutional partners](https://github.com/pandas-dev/pandas-governance/blob/master/people.md#institutional-partners). The largest work item in our grant award was library maintenance, which specifically includes working with community members to address our large backlog of open issues and pull requests. + +While a “1.0.0” version might seem arbitrary or anti-climactic (given that pandas as a codebase is nearly 12 years old), we see it as a symbolic milestone celebrating the growth of our core developer team and depth of our contributor base. Few open source projects are ever truly “done” and pandas is no different. We recognize the essential role that pandas now occupies, and we intend to continue to evolve the project and adapt to the needs of the world’s data wranglers. + +## Going Forward + +Our [roadmap](https://pandas.pydata.org/pandas-docs/version/1.0.0/development/roadmap.html) contains an up-to-date listing of where we see the project heading over the next few years. +Needless to say, there's still plenty to do. + +Check out the [release notes](https://pandas.pydata.org/pandas-docs/version/1.0.0/whatsnew/v1.0.0.html) and visit the [installation page](https://pandas.pydata.org/pandas-docs/version/1.0.0/getting_started/install.html) for instructions on updating to pandas 1.0. diff --git a/web/pandas/community/coc.md b/web/pandas/community/coc.md index bf62f4e00f847..d2af9c3fdd25b 100644 --- a/web/pandas/community/coc.md +++ b/web/pandas/community/coc.md @@ -20,6 +20,9 @@ Examples of unacceptable behavior by participants include: addresses, without explicit permission * Other unethical or unprofessional conduct +Furthermore, we encourage inclusive behavior - for example, +please don’t say “hey guys!” but “hey everyone!”. 
+
 Project maintainers have the right and responsibility to remove, edit, or
 reject comments, commits, code, wiki edits, issues, and other contributions
 that are not aligned to this Code of Conduct, or to ban temporarily or
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index a52c580f23530..23575cc123050 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -15,6 +15,7 @@ main:
   - toc
   - tables
   - fenced_code
+  - meta
 static:
   logo: /static/img/pandas_white.svg
   css:
@@ -23,7 +24,7 @@ navbar:
   - name: "About us"
     target:
     - name: "About pandas"
-      target: /about/index.html
+      target: /about/
     - name: "Project roadmap"
       target: /about/roadmap.html
     - name: "Team"
@@ -39,7 +40,7 @@ navbar:
   - name: "Community"
     target:
     - name: "Blog"
-      target: /community/blog.html
+      target: /community/blog/
     - name: "Ask a question (StackOverflow)"
       target: https://stackoverflow.com/questions/tagged/pandas
     - name: "Code of conduct"
@@ -49,9 +50,11 @@ navbar:
   - name: "Contribute"
     target: /contribute.html
 blog:
-  num_posts: 8
+  num_posts: 50
+  posts_path: community/blog
+  author: "pandas team"
+  feed_name: "pandas blog"
   feed:
-    - https://dev.pandas.io/pandas-blog/feeds/all.atom.xml
     - https://wesmckinney.com/feeds/pandas.atom.xml
     - https://tomaugspurger.github.io/feed
     - https://jorisvandenbossche.github.io/feeds/pandas.atom.xml
@@ -127,7 +130,7 @@ sponsors:
     url: https://chanzuckerberg.com/
     logo: /static/img/partners/czi.svg
     kind: regular
-    description: "pandas is funded by the Essential Open Source Software for Science program of the Chan Zuckerberg Initiative. The funding is used for general maintainance, improve extension types, and a efficient string type."
+    description: "pandas is funded by the Essential Open Source Software for Science program of the Chan Zuckerberg Initiative. The funding is used for general maintenance, improving extension types, and an efficient string type."
inkind: # not included in active so they don't appear in the home page - name: "OVH" url: https://us.ovhcloud.com/ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_13_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_13_0.png new file mode 100644 index 0000000000000..9ce2ff483f2c2 Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_13_0.png differ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_18_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_18_0.png new file mode 100644 index 0000000000000..63b2c93b0573d Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_18_0.png differ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_20_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_20_0.png new file mode 100644 index 0000000000000..1c7abb0434dad Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_20_0.png differ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_22_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_22_0.png new file mode 100644 index 0000000000000..5ef3d69b48700 Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_22_0.png differ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_24_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_24_0.png new file mode 100644 index 0000000000000..1a15be05af92d Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_24_0.png differ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_26_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_26_0.png new file mode 100644 index 0000000000000..4f8d9f2c439ae Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_26_0.png differ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_31_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_31_0.png new file mode 100644 index 0000000000000..6c8b5f1108f79 Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_31_0.png differ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_33_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_33_0.png new file mode 100644 index 0000000000000..fd490d3e7255a Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_33_0.png differ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_4_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_4_0.png new file mode 100644 index 0000000000000..5276ed359badb Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_4_0.png differ diff --git a/web/pandas/static/img/blog/2019-user-survey/2019_5_0.png b/web/pandas/static/img/blog/2019-user-survey/2019_5_0.png new file mode 100644 index 0000000000000..a252e1c9b3503 Binary files /dev/null and b/web/pandas/static/img/blog/2019-user-survey/2019_5_0.png differ diff --git a/web/pandas_web.py b/web/pandas_web.py index 38ab78f5690e7..e62deaa8cdc7f 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -78,6 +78,47 @@ def blog_add_posts(context): """ tag_expr = re.compile("<.*?>") posts = [] + # posts from the file system + if context["blog"]["posts_path"]: + posts_path = os.path.join( + context["source_path"], *context["blog"]["posts_path"].split("/") + ) + for fname in os.listdir(posts_path): + if fname.startswith("index."): + continue + link = ( + f"/{context['blog']['posts_path']}" + f"/{os.path.splitext(fname)[0]}.html" + ) + md = markdown.Markdown( + 
extensions=context["main"]["markdown_extensions"] + ) + with open(os.path.join(posts_path, fname)) as f: + html = md.convert(f.read()) + title = md.Meta["title"][0] + summary = re.sub(tag_expr, "", html) + try: + body_position = summary.index(title) + len(title) + except ValueError: + raise ValueError( + f'Blog post "{fname}" should have a markdown header ' + f'corresponding to its "Title" element "{title}"' + ) + summary = " ".join(summary[body_position:].split(" ")[:30]) + posts.append( + { + "title": title, + "author": context["blog"]["author"], + "published": datetime.datetime.strptime( + md.Meta["date"][0], "%Y-%m-%d" + ), + "feed": context["blog"]["feed_name"], + "link": link, + "description": summary, + "summary": summary, + } + ) + # posts from rss feeds for feed_url in context["blog"]["feed"]: feed_data = feedparser.parse(feed_url) for entry in feed_data.entries: @@ -180,6 +221,7 @@ def get_context(config_fname: str, ignore_io_errors: bool, **kwargs): with open(config_fname) as f: context = yaml.safe_load(f) + context["source_path"] = os.path.dirname(config_fname) context["ignore_io_errors"] = ignore_io_errors context.update(kwargs)
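To make the new `blog_add_posts` logic above easier to follow, here is a small, self-contained sketch of how the Python-Markdown `meta` extension (enabled via the new `- meta` entry in `config.yml`) exposes a post's front matter; the sample post text below is made up for illustration only:

```python
import markdown

# A made-up post, mimicking the front-matter style of the blog posts added above.
post = """Title: pandas 1.0
Date: 2020-01-29

# pandas 1.0

Today pandas celebrates its 1.0.0 release.
"""

md = markdown.Markdown(extensions=["meta"])
html = md.convert(post)

# After convert(), the parsed front matter is available on md.Meta:
# keys are lowercased field names, values are lists of strings.
print(md.Meta["title"][0])  # pandas 1.0
print(md.Meta["date"][0])   # 2020-01-29
```

This mirrors how `pandas_web.py` reads `md.Meta["title"][0]` and `md.Meta["date"][0]` for each post before building the summary and publication date.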