From b7fcb545a1a298817cd7d9f8940f19992d1202d2 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 9 Jan 2020 18:11:30 -0800
Subject: [PATCH 001/158] CLN: remove unnecessary overriding in subclasses
 (#30875)

---
 pandas/core/indexes/category.py | 13 -------------
 pandas/core/indexes/interval.py | 18 ++----------------
 2 files changed, 2 insertions(+), 29 deletions(-)

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 41072d4ce6a93..a247a986fcb55 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -428,19 +428,6 @@ def _engine(self):
         codes = self.codes
         return self._engine_type(lambda: codes, len(self))
 
-    # introspection
-    @cache_readonly
-    def is_unique(self) -> bool:
-        return self._engine.is_unique
-
-    @property
-    def is_monotonic_increasing(self):
-        return self._engine.is_monotonic_increasing
-
-    @property
-    def is_monotonic_decreasing(self) -> bool:
-        return self._engine.is_monotonic_decreasing
-
     @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs)
     def unique(self, level=None):
         if level is not None:
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index d33ba52cc7524..1c86235f9eaa1 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -437,22 +437,8 @@ def memory_usage(self, deep: bool = False) -> int:
         # so return the bytes here
         return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep)
 
-    @cache_readonly
-    def is_monotonic(self) -> bool:
-        """
-        Return True if the IntervalIndex is monotonic increasing (only equal or
-        increasing values), else False
-        """
-        return self.is_monotonic_increasing
-
-    @cache_readonly
-    def is_monotonic_increasing(self) -> bool:
-        """
-        Return True if the IntervalIndex is monotonic increasing (only equal or
-        increasing values), else False
-        """
-        return self._engine.is_monotonic_increasing
-
+    # IntervalTree doesn't have an is_monotonic_decreasing, so we have to
+    # override the Index implementation
     @cache_readonly
     def is_monotonic_decreasing(self) -> bool:
         """

From 6f2c509984de999b09d44efd1e96dff92038afcf Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 10 Jan 2020 15:01:10 +0100
Subject: [PATCH 002/158] DEPR: fix missing stacklevel in pandas.core.index
 deprecation (#30878)

---
 pandas/core/index.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/index.py b/pandas/core/index.py
index a9c8e6731a17e..8cff53d7a8b74 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -27,4 +27,5 @@
     "pandas.core.index is deprecated and will be removed in a future version. "
     "The public classes are available in the top-level namespace.",
     FutureWarning,
+    stacklevel=2,
 )

From d1b9598d69af350f718128c567a856848cff595d Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Fri, 10 Jan 2020 11:22:37 -0800
Subject: [PATCH 003/158] DOC: Encourage use of pre-commit in the docs
 (#30864)

Previously, we stated it as merely optional

xref:
https://github.com/pandas-dev/pandas/pull/30773
https://github.com/pandas-dev/pandas/pull/30814
---
 doc/source/development/contributing.rst | 51 +++++++++++++++++--------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 93c65ba7358c9..2dc5ed07544d1 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -635,6 +635,8 @@ many errors as possible, but it may not correct *all* of them.
 Thus, it is recommended that you run ``cpplint`` to double check and make
 any other style fixes manually.
 
+.. _contributing.code-formatting:
+
 Python (PEP8 / black)
 ~~~~~~~~~~~~~~~~~~~~~
 
@@ -656,19 +658,8 @@ apply ``black`` as you edit files.
 
 You should use a ``black`` version >= 19.10b0 as previous versions are not compatible with the pandas codebase.
 
-Optionally, you may wish to setup `pre-commit hooks <https://pre-commit.com/>`_
-to automatically run ``black`` and ``flake8`` when you make a git commit. This
-can be done by installing ``pre-commit``::
-
-    pip install pre-commit
-
-and then running::
-
-    pre-commit install
-
-from the root of the pandas repository. Now ``black`` and ``flake8`` will be run
-each time you commit changes. You can skip these checks with
-``git commit --no-verify``.
+If you wish to run these checks automatically, we encourage you to use
+:ref:`pre-commits <contributing.pre-commit>` instead.
 
 One caveat about ``git diff upstream/master -u -- "*.py" | flake8 --diff``:
 this command will catch any stylistic errors in your changes specifically, but
 be aware it may not catch all of them. For example, if you delete the only
 usage of an imported function, it is stylistically incorrect to import an
 unused function. However, style-checking the diff will not catch this because
 the actual import is not part of the diff. Thus, for completeness, you should
-run this command, though it will take longer::
+run this command, though it may take longer::
 
     git diff upstream/master --name-only -- "*.py" | xargs -r flake8
 
@@ -694,6 +685,8 @@ behaviour as follows::
 
 This will get all the files being changed by the PR (and ending with ``.py``),
 and run ``flake8`` on them, one after the other.
 
+Note that these commands can be run analogously with ``black``.
+
 .. _contributing.import-formatting:
 
 Import formatting
 ~~~~~~~~~~~~~~~~~
@@ -716,7 +709,6 @@ A summary of our current import sections ( in order ):
 
 Imports are alphabetically sorted within these sections.
 
-
 As part of :ref:`Continuous Integration <contributing.ci>` checks we run::
 
     isort --recursive --check-only pandas
@@ -740,8 +732,37 @@ to automatically format imports correctly. This will modify your local copy of t
 
 The `--recursive` flag can be passed to sort all files in a directory.
 
+Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above <contributing.code-formatting>`::
+
+    git diff upstream/master --name-only -- "*.py" | xargs -r isort
+
+The same caveats apply if you are on OSX or Windows.
+
 You can then verify the changes look ok, then git :ref:`commit <contributing.commit-code>` and :ref:`push <contributing.push-code>`.
 
+.. _contributing.pre-commit:
+
+Pre-Commit
+~~~~~~~~~~
+
+You can run many of these styling checks manually as we have described above. However,
+we encourage you to use `pre-commit hooks <https://pre-commit.com/>`_ instead
+to automatically run ``black``, ``flake8``, and ``isort`` when you make a git commit. This
+can be done by installing ``pre-commit``::
+
+    pip install pre-commit
+
+and then running::
+
+    pre-commit install
+
+from the root of the pandas repository. Now all of the styling checks will be
+run each time you commit changes without needing to run each one manually.
+Using this pre-commit hook will also allow you to more easily
+remain up-to-date with our code checks as they change.
+
+Note that if needed, you can skip these checks with ``git commit --no-verify``.
+ Backwards compatibility ~~~~~~~~~~~~~~~~~~~~~~~ From 447a3b008c695095c0db009965285080a72c402c Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 10 Jan 2020 19:26:20 +0000 Subject: [PATCH 004/158] WEB: Removing Discourse links (#30890) We are not using them for now. --- web/pandas/_templates/layout.html | 5 ----- web/pandas/config.yml | 2 -- 2 files changed, 7 deletions(-) diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index 120058afd1190..92126a7b5a2f2 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -84,11 +84,6 @@ -
  • - - - -
  • pandas is a fiscally sponsored project of NumFOCUS diff --git a/web/pandas/config.yml b/web/pandas/config.yml index e2a95a5039884..d1fb7ba0f7b86 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -50,8 +50,6 @@ navbar: target: /community/blog.html - name: "Ask a question (StackOverflow)" target: https://stackoverflow.com/questions/tagged/pandas - - name: "Discuss" - target: https://pandas.discourse.group - name: "Code of conduct" target: /community/coc.html - name: "Ecosystem" From 03cdcb62089b79d548543279978effcd2c670a63 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 10 Jan 2020 23:21:20 +0000 Subject: [PATCH 005/158] WEB: Remove from roadmap moving the docstring script (#30893) --- doc/source/development/roadmap.rst | 14 -------------- web/pandas/about/roadmap.md | 13 ------------- 2 files changed, 27 deletions(-) diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index 00598830e2fe9..fafe63d80249c 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -129,20 +129,6 @@ Some specific goals include * Improve the overall organization of the documentation and specific subsections of the documentation to make navigation and finding content easier. -Package docstring validation ----------------------------- - -To improve the quality and consistency of pandas docstrings, we've developed -tooling to check docstrings in a variety of ways. -https://github.com/pandas-dev/pandas/blob/master/scripts/validate_docstrings.py -contains the checks. - -Like many other projects, pandas uses the -`numpydoc `__ style for writing -docstrings. With the collaboration of the numpydoc maintainers, we'd like to -move the checks to a package other than pandas so that other projects can easily -use them as well. - Performance monitoring ---------------------- diff --git a/web/pandas/about/roadmap.md b/web/pandas/about/roadmap.md index 8a5c2735b3d93..35a6b3361f32e 100644 --- a/web/pandas/about/roadmap.md +++ b/web/pandas/about/roadmap.md @@ -134,19 +134,6 @@ pandas documentation. Some specific goals include subsections of the documentation to make navigation and finding content easier. -## Package docstring validation - -To improve the quality and consistency of pandas docstrings, we've -developed tooling to check docstrings in a variety of ways. - -contains the checks. - -Like many other projects, pandas uses the -[numpydoc](https://numpydoc.readthedocs.io/en/latest/) style for writing -docstrings. With the collaboration of the numpydoc maintainers, we'd -like to move the checks to a package other than pandas so that other -projects can easily use them as well. - ## Performance monitoring Pandas uses [airspeed velocity](https://asv.readthedocs.io/en/stable/) From 0b4bac700f5a0809213e7ad9a8e78f5cb1244c62 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sat, 11 Jan 2020 01:48:03 +0200 Subject: [PATCH 006/158] TYP: typing annotations (#30901) --- pandas/_config/display.py | 3 ++- pandas/_config/localization.py | 6 +++--- pandas/compat/numpy/function.py | 34 ++++++++++++++++++++++----------- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/pandas/_config/display.py b/pandas/_config/display.py index 067b7c503baab..ef319f4447565 100644 --- a/pandas/_config/display.py +++ b/pandas/_config/display.py @@ -1,6 +1,7 @@ """ Unopinionated display configuration. 
""" + import locale import sys @@ -11,7 +12,7 @@ _initial_defencoding = None -def detect_console_encoding(): +def detect_console_encoding() -> str: """ Try to find the most capable encoding supported by the console. slightly modified from the way IPython handles the same issue. diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index dd1d4948aa6e3..0d68e78372d8a 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -12,7 +12,7 @@ @contextmanager -def set_locale(new_locale, lc_var=locale.LC_ALL): +def set_locale(new_locale, lc_var: int = locale.LC_ALL): """ Context manager for temporarily setting a locale. @@ -44,7 +44,7 @@ def set_locale(new_locale, lc_var=locale.LC_ALL): locale.setlocale(lc_var, current_locale) -def can_set_locale(lc, lc_var=locale.LC_ALL): +def can_set_locale(lc: str, lc_var: int = locale.LC_ALL) -> bool: """ Check to see if we can set a locale, and subsequently get the locale, without raising an Exception. @@ -58,7 +58,7 @@ def can_set_locale(lc, lc_var=locale.LC_ALL): Returns ------- - is_valid : bool + bool Whether the passed locale can be set """ diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 7158f251ad805..50f234cbf9419 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -33,13 +33,26 @@ class CompatValidator: - def __init__(self, defaults, fname=None, method=None, max_fname_arg_count=None): + def __init__( + self, + defaults, + fname=None, + method: Optional[str] = None, + max_fname_arg_count=None, + ): self.fname = fname self.method = method self.defaults = defaults self.max_fname_arg_count = max_fname_arg_count - def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=None): + def __call__( + self, + args, + kwargs, + fname=None, + max_fname_arg_count=None, + method: Optional[str] = None, + ) -> None: if args or kwargs: fname = self.fname if fname is None else fname max_fname_arg_count = ( @@ -300,7 +313,7 @@ def validate_take_with_convert(convert, args, kwargs): ) -def validate_window_func(name, args, kwargs): +def validate_window_func(name, args, kwargs) -> None: numpy_args = ("axis", "dtype", "out") msg = ( f"numpy operations are not valid with window objects. " @@ -315,7 +328,7 @@ def validate_window_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_rolling_func(name, args, kwargs): +def validate_rolling_func(name, args, kwargs) -> None: numpy_args = ("axis", "dtype", "out") msg = ( f"numpy operations are not valid with window objects. " @@ -330,7 +343,7 @@ def validate_rolling_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_expanding_func(name, args, kwargs): +def validate_expanding_func(name, args, kwargs) -> None: numpy_args = ("axis", "dtype", "out") msg = ( f"numpy operations are not valid with window objects. " @@ -345,7 +358,7 @@ def validate_expanding_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_groupby_func(name, args, kwargs, allowed=None): +def validate_groupby_func(name, args, kwargs, allowed=None) -> None: """ 'args' and 'kwargs' should be empty, except for allowed kwargs because all of @@ -359,16 +372,15 @@ def validate_groupby_func(name, args, kwargs, allowed=None): if len(args) + len(kwargs) > 0: raise UnsupportedFunctionCall( - f"numpy operations are not valid with " - f"groupby. Use .groupby(...).{name}() " - f"instead" + "numpy operations are not valid with groupby. 
" + f"Use .groupby(...).{name}() instead" ) RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") -def validate_resampler_func(method, args, kwargs): +def validate_resampler_func(method: str, args, kwargs) -> None: """ 'args' and 'kwargs' should be empty because all of their necessary parameters are explicitly listed in @@ -385,7 +397,7 @@ def validate_resampler_func(method, args, kwargs): raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis): +def validate_minmax_axis(axis: Optional[int]) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero or None, as otherwise it will be incorrectly ignored. From f887eb09ba19311408717c0bed1f36732ab8f71a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Jan 2020 15:57:33 -0800 Subject: [PATCH 007/158] TYP: offsets (#30897) --- pandas/tseries/offsets.py | 90 +++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 8bb98a271bce8..d31c23c7ccf1d 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -365,7 +365,7 @@ def apply_index(self, i): "applied vectorized" ) - def is_anchored(self): + def is_anchored(self) -> bool: # TODO: Does this make sense for the general case? It would help # if there were a canonical docstring for what is_anchored means. return self.n == 1 @@ -378,7 +378,7 @@ def onOffset(self, dt): ) return self.is_on_offset(dt) - def isAnchored(self): + def isAnchored(self) -> bool: warnings.warn( "isAnchored is a deprecated, use is_anchored instead", FutureWarning, @@ -389,7 +389,7 @@ def isAnchored(self): # TODO: Combine this with BusinessMixin version by defining a whitelisted # set of attributes on each object rather than the existing behavior of # iterating over internal ``__dict__`` - def _repr_attrs(self): + def _repr_attrs(self) -> str: exclude = {"n", "inc", "normalize"} attrs = [] for attr in sorted(self.__dict__): @@ -405,7 +405,7 @@ def _repr_attrs(self): return out @property - def name(self): + def name(self) -> str: return self.rule_code def rollback(self, dt): @@ -452,15 +452,15 @@ def is_on_offset(self, dt): # way to get around weirdness with rule_code @property - def _prefix(self): + def _prefix(self) -> str: raise NotImplementedError("Prefix not defined") @property - def rule_code(self): + def rule_code(self) -> str: return self._prefix @cache_readonly - def freqstr(self): + def freqstr(self) -> str: try: code = self.rule_code except NotImplementedError: @@ -480,7 +480,7 @@ def freqstr(self): return fstr - def _offset_str(self): + def _offset_str(self) -> str: return "" @property @@ -529,11 +529,11 @@ def offset(self): # Alias for backward compat return self._offset - def _repr_attrs(self): + def _repr_attrs(self) -> str: if self.offset: attrs = [f"offset={repr(self.offset)}"] else: - attrs = None + attrs = [] out = "" if attrs: out += ": " + ", ".join(attrs) @@ -553,7 +553,7 @@ def __init__(self, n=1, normalize=False, offset=timedelta(0)): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "_offset", offset) - def _offset_str(self): + def _offset_str(self) -> str: def get_str(td): off_str = "" if td.days > 0: @@ -649,7 +649,7 @@ def apply_index(self, i): result = shifted.to_timestamp() + time return result - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.weekday() < 5 @@ -1087,7 +1087,7 @@ def apply(self, 
other): def apply_index(self, i): raise NotImplementedError - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False day64 = _to_dt64(dt, "datetime64[D]") @@ -1134,14 +1134,14 @@ class MonthOffset(SingleConstructorOffset): __init__ = BaseOffset.__init__ @property - def name(self): + def name(self) -> str: if self.is_anchored: return self.rule_code else: month = ccalendar.MONTH_ALIASES[self.n] return f"{self.code_rule}-{month}" - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.day == self._get_offset_day(dt) @@ -1333,7 +1333,7 @@ def _from_name(cls, suffix=None): return cls(day_of_month=suffix) @property - def rule_code(self): + def rule_code(self) -> str: suffix = f"-{self.day_of_month}" return self._prefix + suffix @@ -1429,7 +1429,7 @@ class SemiMonthEnd(SemiMonthOffset): _prefix = "SM" _min_day_of_month = 1 - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False days_in_month = ccalendar.get_days_in_month(dt.year, dt.month) @@ -1487,7 +1487,7 @@ class SemiMonthBegin(SemiMonthOffset): _prefix = "SMS" - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.day in (1, self.day_of_month) @@ -1556,7 +1556,7 @@ def __init__(self, n=1, normalize=False, weekday=None): if self.weekday < 0 or self.weekday > 6: raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") - def is_anchored(self): + def is_anchored(self) -> bool: return self.n == 1 and self.weekday is not None @apply_wraps @@ -1632,7 +1632,7 @@ def _end_apply_index(self, dtindex): return base + off + Timedelta(1, "ns") - Timedelta(1, "D") - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False elif self.weekday is None: @@ -1640,7 +1640,7 @@ def is_on_offset(self, dt): return dt.weekday() == self.weekday @property - def rule_code(self): + def rule_code(self) -> str: suffix = "" if self.weekday is not None: weekday = ccalendar.int_to_weekday[self.weekday] @@ -1717,7 +1717,7 @@ def __init__(self, n=1, normalize=False, week=0, weekday=0): if self.week < 0 or self.week > 3: raise ValueError(f"Week must be 0<=week<=3, got {self.week}") - def _get_offset_day(self, other): + def _get_offset_day(self, other: datetime) -> int: """ Find the day in the same month as other that has the same weekday as self.weekday and is the self.week'th such day in the month. @@ -1736,7 +1736,7 @@ def _get_offset_day(self, other): return 1 + shift_days + self.week * 7 @property - def rule_code(self): + def rule_code(self) -> str: weekday = ccalendar.int_to_weekday.get(self.weekday, "") return f"{self._prefix}-{self.week + 1}{weekday}" @@ -1785,7 +1785,7 @@ def __init__(self, n=1, normalize=False, weekday=0): if self.weekday < 0 or self.weekday > 6: raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") - def _get_offset_day(self, other): + def _get_offset_day(self, other: datetime) -> int: """ Find the day in the same month as other that has the same weekday as self.weekday and is the last such day in the month. 
@@ -1805,7 +1805,7 @@ def _get_offset_day(self, other): return dim - shift_days @property - def rule_code(self): + def rule_code(self) -> str: weekday = ccalendar.int_to_weekday.get(self.weekday, "") return f"{self._prefix}-{weekday}" @@ -1842,7 +1842,7 @@ def __init__(self, n=1, normalize=False, startingMonth=None): startingMonth = self._default_startingMonth object.__setattr__(self, "startingMonth", startingMonth) - def is_anchored(self): + def is_anchored(self) -> bool: return self.n == 1 and self.startingMonth is not None @classmethod @@ -1856,7 +1856,7 @@ def _from_name(cls, suffix=None): return cls(**kwargs) @property - def rule_code(self): + def rule_code(self) -> str: month = ccalendar.MONTH_ALIASES[self.startingMonth] return f"{self._prefix}-{month}" @@ -1874,7 +1874,7 @@ def apply(self, other): months = qtrs * 3 - months_since return shift_month(other, months, self._day_opt) - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False mod_month = (dt.month - self.startingMonth) % 3 @@ -1953,7 +1953,7 @@ class YearOffset(DateOffset): _adjust_dst = True _attributes = frozenset(["n", "normalize", "month"]) - def _get_offset_day(self, other): + def _get_offset_day(self, other: datetime) -> int: # override BaseOffset method to use self.month instead of other.month # TODO: there may be a more performant way to do this return liboffsets.get_day_of_month( @@ -1977,7 +1977,7 @@ def apply_index(self, dtindex): shifted, freq=dtindex.freq, dtype=dtindex.dtype ) - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.month == self.month and dt.day == self._get_offset_day(dt) @@ -1999,7 +1999,7 @@ def _from_name(cls, suffix=None): return cls(**kwargs) @property - def rule_code(self): + def rule_code(self) -> str: month = ccalendar.MONTH_ALIASES[self.month] return f"{self._prefix}-{month}" @@ -2117,12 +2117,12 @@ def __init__( if self.variation not in ["nearest", "last"]: raise ValueError(f"{self.variation} is not a valid variation") - def is_anchored(self): + def is_anchored(self) -> bool: return ( self.n == 1 and self.startingMonth is not None and self.weekday is not None ) - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False dt = datetime(dt.year, dt.month, dt.day) @@ -2217,18 +2217,18 @@ def get_year_end(self, dt): return target_date + timedelta(days_forward - 7) @property - def rule_code(self): + def rule_code(self) -> str: prefix = self._prefix suffix = self.get_rule_code_suffix() return f"{prefix}-{suffix}" - def _get_suffix_prefix(self): + def _get_suffix_prefix(self) -> str: if self.variation == "nearest": return "N" else: return "L" - def get_rule_code_suffix(self): + def get_rule_code_suffix(self) -> str: prefix = self._get_suffix_prefix() month = ccalendar.MONTH_ALIASES[self.startingMonth] weekday = ccalendar.int_to_weekday[self.weekday] @@ -2346,7 +2346,7 @@ def _offset(self): variation=self.variation, ) - def is_anchored(self): + def is_anchored(self) -> bool: return self.n == 1 and self._offset.is_anchored() def _rollback_to_year(self, other): @@ -2434,7 +2434,7 @@ def get_weeks(self, dt): return ret - def year_has_extra_week(self, dt): + def year_has_extra_week(self, dt: datetime) -> bool: # Avoid round-down errors --> normalize to get # e.g. 
'370D' instead of '360D23H' norm = Timestamp(dt).normalize().tz_localize(None) @@ -2445,7 +2445,7 @@ def year_has_extra_week(self, dt): assert weeks_in_year in [52, 53], weeks_in_year return weeks_in_year == 53 - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False if self._offset.is_on_offset(dt): @@ -2463,7 +2463,7 @@ def is_on_offset(self, dt): return False @property - def rule_code(self): + def rule_code(self) -> str: suffix = self._offset.get_rule_code_suffix() qtr = self.qtr_with_extra_week return f"{self._prefix}-{suffix}-{qtr}" @@ -2516,7 +2516,7 @@ def apply(self, other): ) return new - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return date(dt.year, dt.month, dt.day) == easter(dt.year) @@ -2596,7 +2596,7 @@ def __eq__(self, other: Any) -> bool: # This is identical to DateOffset.__hash__, but has to be redefined here # for Python 3, because we've redefined __eq__. - def __hash__(self): + def __hash__(self) -> int: return hash(self._params) def __ne__(self, other): @@ -2617,7 +2617,7 @@ def __ne__(self, other): return True @property - def delta(self): + def delta(self) -> Timedelta: return self.n * self._inc @property @@ -2648,11 +2648,11 @@ def apply(self, other): raise ApplyTypeError(f"Unhandled type: {type(other).__name__}") - def is_anchored(self): + def is_anchored(self) -> bool: return False -def _delta_to_tick(delta): +def _delta_to_tick(delta: timedelta) -> Tick: if delta.microseconds == 0 and getattr(delta, "nanoseconds", 0) == 0: # nanoseconds only for pd.Timedelta if delta.seconds == 0: From 6e9651e40db39e738ad3f5db09591f877159ecbb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 11 Jan 2020 01:53:41 -0800 Subject: [PATCH 008/158] BUG: pickle files left behind by tm.round_trip_pickle (#30906) --- pandas/_testing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 0b81fb0f7a8d5..1fdc5d478aaf6 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -122,9 +122,9 @@ def round_trip_pickle( _path = path if _path is None: _path = f"__{rands(10)}__.pickle" - with ensure_clean(_path) as path: - pd.to_pickle(obj, _path) - return pd.read_pickle(_path) + with ensure_clean(_path) as temp_path: + pd.to_pickle(obj, temp_path) + return pd.read_pickle(temp_path) def round_trip_pathlib(writer, reader, path: Optional[str] = None): From 398bf7d73ad019a3a9c611660acccde301fd3945 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sun, 12 Jan 2020 00:26:53 +0800 Subject: [PATCH 009/158] BUG: Series rolling count ignores min_periods (GH26996) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/window/rolling.py | 4 +++- pandas/tests/window/test_expanding.py | 7 +++++++ pandas/tests/window/test_rolling.py | 7 +++++++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5f79accc5c679..d2ef90945a842 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1112,6 +1112,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) - Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`) +- Bug in 
:meth:`Rolling.count` and :meth:`Expanding.count` argument ``min_periods`` ignored (:issue:`26996`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f612826132fd7..a89991dbf14af 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1185,6 +1185,8 @@ def count(self): window = self._get_window() window = min(window, len(obj)) if not self.center else window + min_periods = self.min_periods if self.min_periods is not None else 0 + min_periods = min(min_periods, len(obj)) if not self.center else min_periods results = [] for b in blocks: @@ -1192,7 +1194,7 @@ def count(self): result = self._constructor( result, window=window, - min_periods=0, + min_periods=min_periods, center=self.center, axis=self.axis, closed=self.closed, diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index fc4bd50f25c73..58ad20e473560 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -113,3 +113,10 @@ def test_expanding_axis(self, axis_frame): result = df.expanding(3, axis=axis_frame).sum() tm.assert_frame_equal(result, expected) + + +def test_expanding_count_with_min_periods(): + # GH 26996 + result = Series(range(5)).expanding(min_periods=3).count() + expected = Series([np.nan, np.nan, 3.0, 4.0, 5.0]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 04fab93b71c4a..26606973f5210 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -426,3 +426,10 @@ def test_min_periods1(): result = df["a"].rolling(3, center=True, min_periods=1).max() expected = pd.Series([1.0, 2.0, 2.0, 2.0, 1.0], name="a") tm.assert_series_equal(result, expected) + + +def test_rolling_count_with_min_periods(): + # GH 26996 + result = Series(range(5)).rolling(3, min_periods=3).count() + expected = Series([np.nan, np.nan, 3.0, 3.0, 3.0]) + tm.assert_series_equal(result, expected) From 7f2948cad169c7b95e7a509145b66c1e599da2ba Mon Sep 17 00:00:00 2001 From: HH-MWB <50187675+HH-MWB@users.noreply.github.com> Date: Sat, 11 Jan 2020 18:11:34 -0500 Subject: [PATCH 010/158] replace syntax with f-string (#30919) --- pandas/core/arrays/period.py | 4 ++-- pandas/core/dtypes/common.py | 3 +-- pandas/core/dtypes/dtypes.py | 3 +-- pandas/core/frame.py | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8b49c2186dde0..697d759206ff9 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -298,11 +298,11 @@ def __arrow_array__(self, type=None): if self.freqstr != type.freq: raise TypeError( "Not supported to convert PeriodArray to array with different" - " 'freq' ({0} vs {1})".format(self.freqstr, type.freq) + f" 'freq' ({self.freqstr} vs {type.freq})" ) else: raise TypeError( - "Not supported to convert PeriodArray to '{0}' type".format(type) + f"Not supported to convert PeriodArray to '{type}' type" ) period_type = ArrowPeriodType(self.freqstr) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 5a007f28d63cb..f62f03be9b732 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -194,12 +194,11 @@ def ensure_python_int(value: Union[int, np.integer]) -> int: """ if not is_scalar(value): raise TypeError(f"Value needs to be a scalar value, was type {type(value)}") - msg = "Wrong type {} for value {}" try: new_value = 
int(value) assert new_value == value except (TypeError, ValueError, AssertionError): - raise TypeError(msg.format(type(value), value)) + raise TypeError(f"Wrong type {type(value)} for value {value}") return new_value diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 466ed815e8e5a..93522abc3a48f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -435,12 +435,11 @@ def __eq__(self, other: Any) -> bool: return hash(self) == hash(other) def __repr__(self) -> str_type: - tpl = "CategoricalDtype(categories={data}ordered={ordered})" if self.categories is None: data = "None, " else: data = self.categories._format_data(name=type(self).__name__) - return tpl.format(data=data, ordered=self.ordered) + return f"CategoricalDtype(categories={data}ordered={self.ordered})" @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5ad133f9e21a4..676b78573399c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2431,7 +2431,7 @@ def _verbose_repr(): dtype = self.dtypes.iloc[i] col = pprint_thing(col) - line_no = _put_str(" {num}".format(num=i), space_num) + line_no = _put_str(f" {i}", space_num) count = "" if show_counts: count = counts.iloc[i] From 939e7ddc6e75519f19ae98e30bafe9b9e3c21e46 Mon Sep 17 00:00:00 2001 From: Dina Date: Sun, 12 Jan 2020 01:12:33 +0200 Subject: [PATCH 011/158] CLN: F-strings (#30916) --- pandas/core/arrays/timedeltas.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index c34d14f15075c..516a271042c9b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -43,8 +43,6 @@ from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick -_BAD_DTYPE = "dtype {dtype} cannot be converted to timedelta64[ns]" - def _is_convertible_to_td(key): return isinstance(key, (Tick, timedelta, np.timedelta64, str)) @@ -1064,7 +1062,7 @@ def _validate_td64_dtype(dtype): raise ValueError(msg) if not is_dtype_equal(dtype, _TD_DTYPE): - raise ValueError(_BAD_DTYPE.format(dtype=dtype)) + raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]") return dtype From b47a454b2022218b78fc68cedb3eb009c58c8bfe Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 11 Jan 2020 15:16:40 -0800 Subject: [PATCH 012/158] DOC: Fixture docs in pandas/conftest.py (#30917) --- pandas/conftest.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 3eab2186ccb94..0c964452df5da 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -65,25 +65,28 @@ def pytest_runtest_setup(item): pytest.skip("skipping high memory test since --run-high-memory was not set") -# Configurations for all tests and all test modules - - @pytest.fixture(autouse=True) def configure_tests(): + """ + Configure settings for all tests and test modules. + """ pd.set_option("chained_assignment", "raise") -# For running doctests: make np and pd names available - - @pytest.fixture(autouse=True) def add_imports(doctest_namespace): + """ + Make `np` and `pd` names available for doctests. + """ doctest_namespace["np"] = np doctest_namespace["pd"] = pd @pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) def spmatrix(request): + """ + Yields scipy sparse matrix classes. 
+    """
     from scipy import sparse
 
     return getattr(sparse, request.param + "_matrix")
@@ -92,8 +95,8 @@ def spmatrix(request):
 @pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis {repr(x)}")
 def axis(request):
     """
-    Fixture for returning the axis numbers of a DataFrame.
-    """
+        Fixture for returning the axis numbers of a DataFrame.
+        """
     return request.param
 
@@ -237,6 +240,10 @@ def all_boolean_reductions(request):
 
 @pytest.fixture(params=list(_cython_table))
 def cython_table_items(request):
+    """
+    Yields a tuple of a function and its corresponding name. Corresponds to
+    the list of aggregator "Cython functions" used on selected table items.
+    """
     return request.param
 
@@ -337,6 +344,9 @@ def writable(request):
 
 @pytest.fixture(scope="module")
 def datetime_tz_utc():
+    """
+    Yields the UTC timezone object from the datetime module.
+    """
     return timezone.utc
 
@@ -358,6 +368,9 @@ def join_type(request):
 
 @pytest.fixture
 def strict_data_files(pytestconfig):
+    """
+    Returns the configuration for the test setting `--strict-data-files`.
+    """
     return pytestconfig.getoption("--strict-data-files")
 

From 75ecfa448e2272aedb2352a7ee7d8bb7a8123b3e Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sat, 11 Jan 2020 15:53:35 -0800
Subject: [PATCH 013/158] CLN: remove unnecessary _date_check_type (#30932)

The check doesn't do anything, and we still raise a KeyError anyway

---
 pandas/_libs/index.pyx | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index ac8172146d351..28d269a9a809e 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -447,7 +447,6 @@ cdef class DatetimeEngine(Int64Engine):
                 conv = maybe_datetimelike_to_i8(val)
                 loc = values.searchsorted(conv, side='left')
             except TypeError:
-                self._date_check_type(val)
                 raise KeyError(val)
 
             if loc == len(values) or values[loc] != conv:
@@ -470,12 +469,6 @@ cdef class DatetimeEngine(Int64Engine):
             val = maybe_datetimelike_to_i8(val)
             return self.mapping.get_item(val)
         except (TypeError, ValueError):
-            self._date_check_type(val)
             raise KeyError(val)
 
-    cdef inline _date_check_type(self, object val):
-        hash(val)
-
-        if not util.is_integer_object(val):
-            raise KeyError(val)
 
     def get_indexer(self, values):

From 044559a7a157645934ab78391aa5de811af5be59 Mon Sep 17 00:00:00 2001
From: Galuh Sahid
Date: Sun, 12 Jan 2020 17:07:21 +0700
Subject: [PATCH 014/158] DOC: Fix SS03 docstring error (#30939)

xref:
https://github.com/pandas-dev/pandas/issues/27977
https://github.com/pandas-dev/pandas/issues/30733

---
 pandas/core/arrays/datetimelike.py | 2 +-
 pandas/core/window/indexers.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index d7cabbabddf95..d7c508c890a46 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -920,7 +920,7 @@ def freq(self, value):
     @property
     def freqstr(self):
         """
-        Return the frequency object as a string if it's set, otherwise None
+        Return the frequency object as a string if it's set, otherwise None.
""" if self.freq is None: return None diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 0fa24a0ba1b5a..921cdb3c2523f 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -32,7 +32,7 @@ class BaseIndexer: - """Base class for window bounds calculations""" + """Base class for window bounds calculations.""" def __init__( self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, From 28e909c63daa451e0f70c6cc15c7ad644adc1979 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 12 Jan 2020 14:32:15 +0000 Subject: [PATCH 015/158] TYP: type up parts of series.py (#30761) --- pandas/core/generic.py | 1 - pandas/core/series.py | 117 +++++++++++++++++++++++------------------ 2 files changed, 67 insertions(+), 51 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0116207675889..03e86758b64ed 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4121,7 +4121,6 @@ def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: def sort_values( self, - by=None, axis=0, ascending=True, inplace: bool_t = False, diff --git a/pandas/core/series.py b/pandas/core/series.py index 3e1f011fde51a..ed338700f1011 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4,7 +4,18 @@ from io import StringIO from shutil import get_terminal_size from textwrap import dedent -from typing import IO, Any, Callable, Hashable, List, Optional +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + Hashable, + Iterable, + List, + Optional, + Tuple, + Type, +) import warnings import numpy as np @@ -12,6 +23,7 @@ from pandas._config import get_option from pandas._libs import index as libindex, lib, reshape, tslibs +from pandas._typing import Label from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -80,6 +92,9 @@ import pandas.io.formats.format as fmt import pandas.plotting +if TYPE_CHECKING: + from pandas.core.frame import DataFrame + __all__ = ["Series"] _shared_doc_kwargs = dict( @@ -356,11 +371,11 @@ def _init_dict(self, data, index=None, dtype=None): # ---------------------------------------------------------------------- @property - def _constructor(self): + def _constructor(self) -> Type["Series"]: return Series @property - def _constructor_expanddim(self): + def _constructor_expanddim(self) -> Type["DataFrame"]: from pandas.core.frame import DataFrame return DataFrame @@ -372,7 +387,7 @@ def _can_hold_na(self): _index = None - def _set_axis(self, axis, labels, fastpath=False): + def _set_axis(self, axis, labels, fastpath=False) -> None: """ Override generic, we want to set the _typ here. """ @@ -517,7 +532,7 @@ def __len__(self) -> int: """ return len(self._data) - def view(self, dtype=None): + def view(self, dtype=None) -> "Series": """ Create a new view of the Series. @@ -729,7 +744,7 @@ def __array__(self, dtype=None) -> np.ndarray: # ---------------------------------------------------------------------- - def _unpickle_series_compat(self, state): + def _unpickle_series_compat(self, state) -> None: if isinstance(state, dict): self._data = state["_data"] self.name = state["name"] @@ -760,7 +775,7 @@ def _unpickle_series_compat(self, state): # indexers @property - def axes(self): + def axes(self) -> List[Index]: """ Return a list of the row axis labels. 
""" @@ -770,7 +785,7 @@ def axes(self): # Indexing Methods @Appender(generic.NDFrame.take.__doc__) - def take(self, indices, axis=0, is_copy=False, **kwargs): + def take(self, indices, axis=0, is_copy=False, **kwargs) -> "Series": nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) @@ -816,7 +831,7 @@ def _ixs(self, i: int, axis: int = 0): else: return values[i] - def _slice(self, slobj: slice, axis: int = 0, kind=None): + def _slice(self, slobj: slice, axis: int = 0, kind=None) -> "Series": slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem") return self._get_values(slobj) @@ -1100,7 +1115,7 @@ def _set_value(self, label, value, takeable: bool = False): def _is_mixed_type(self): return False - def repeat(self, repeats, axis=None): + def repeat(self, repeats, axis=None) -> "Series": """ Repeat elements of a Series. @@ -1425,7 +1440,7 @@ def to_markdown( # ---------------------------------------------------------------------- - def items(self): + def items(self) -> Iterable[Tuple[Label, Any]]: """ Lazily iterate over (index, value) tuples. @@ -1455,13 +1470,13 @@ def items(self): return zip(iter(self.index), iter(self)) @Appender(items.__doc__) - def iteritems(self): + def iteritems(self) -> Iterable[Tuple[Label, Any]]: return self.items() # ---------------------------------------------------------------------- # Misc public methods - def keys(self): + def keys(self) -> Index: """ Return alias for index. @@ -1507,7 +1522,7 @@ def to_dict(self, into=dict): into_c = com.standardize_mapping(into) return into_c(self.items()) - def to_frame(self, name=None): + def to_frame(self, name=None) -> "DataFrame": """ Convert Series to DataFrame. @@ -1539,7 +1554,7 @@ def to_frame(self, name=None): return df - def _set_name(self, name, inplace=False): + def _set_name(self, name, inplace=False) -> "Series": """ Set the Series name. @@ -1681,7 +1696,7 @@ def count(self, level=None): out = np.bincount(obs, minlength=len(lev) or None) return self._constructor(out, index=lev, dtype="int64").__finalize__(self) - def mode(self, dropna=True): + def mode(self, dropna=True) -> "Series": """ Return the mode(s) of the dataset. @@ -1766,7 +1781,7 @@ def unique(self): result = super().unique() return result - def drop_duplicates(self, keep="first", inplace=False): + def drop_duplicates(self, keep="first", inplace=False) -> "Series": """ Return Series with duplicate values removed. @@ -1843,7 +1858,7 @@ def drop_duplicates(self, keep="first", inplace=False): """ return super().drop_duplicates(keep=keep, inplace=inplace) - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> "Series": """ Indicate duplicate Series values. @@ -2062,7 +2077,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): return np.nan return self.index[i] - def round(self, decimals=0, *args, **kwargs): + def round(self, decimals=0, *args, **kwargs) -> "Series": """ Round each value in a Series to the given number of decimals. @@ -2157,7 +2172,7 @@ def quantile(self, q=0.5, interpolation="linear"): # scalar return result.iloc[0] - def corr(self, other, method="pearson", min_periods=None): + def corr(self, other, method="pearson", min_periods=None) -> float: """ Compute correlation with `other` Series, excluding missing values. 
@@ -2210,7 +2225,7 @@ def corr(self, other, method="pearson", min_periods=None): f"'{method}' was supplied" ) - def cov(self, other, min_periods=None): + def cov(self, other, min_periods=None) -> float: """ Compute covariance with Series, excluding missing values. @@ -2239,7 +2254,7 @@ def cov(self, other, min_periods=None): return np.nan return nanops.nancov(this.values, other.values, min_periods=min_periods) - def diff(self, periods=1): + def diff(self, periods=1) -> "Series": """ First discrete difference of element. @@ -2303,7 +2318,7 @@ def diff(self, periods=1): result = algorithms.diff(com.values_from_object(self), periods) return self._constructor(result, index=self.index).__finalize__(self) - def autocorr(self, lag=1): + def autocorr(self, lag=1) -> float: """ Compute the lag-N autocorrelation. @@ -2446,7 +2461,7 @@ def searchsorted(self, value, side="left", sorter=None): # ------------------------------------------------------------------- # Combination - def append(self, to_append, ignore_index=False, verify_integrity=False): + def append(self, to_append, ignore_index=False, verify_integrity=False) -> "Series": """ Concatenate two or more Series. @@ -2523,8 +2538,10 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): to_concat.extend(to_append) else: to_concat = [self, to_append] - return concat( - to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + return self._ensure_type( + concat( + to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + ) ) def _binop(self, other, func, level=None, fill_value=None): @@ -2566,7 +2583,7 @@ def _binop(self, other, func, level=None, fill_value=None): ret = ops._construct_result(self, result, new_index, name) return ret - def combine(self, other, func, fill_value=None): + def combine(self, other, func, fill_value=None) -> "Series": """ Combine the Series with a Series or scalar according to `func`. @@ -2663,7 +2680,7 @@ def combine(self, other, func, fill_value=None): new_values = try_cast_to_ea(self._values, new_values) return self._constructor(new_values, index=new_index, name=new_name) - def combine_first(self, other): + def combine_first(self, other) -> "Series": """ Combine Series values, choosing the calling Series's values first. @@ -2703,7 +2720,7 @@ def combine_first(self, other): return this.where(notna(this), other) - def update(self, other): + def update(self, other) -> None: """ Modify Series in place using non-NA values from passed Series. Aligns on index. @@ -2762,10 +2779,10 @@ def sort_values( self, axis=0, ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, + inplace: bool = False, + kind: str = "quicksort", + na_position: str = "last", + ignore_index: bool = False, ): """ Sort by the values. @@ -3117,7 +3134,7 @@ def sort_index( else: return result.__finalize__(self) - def argsort(self, axis=0, kind="quicksort", order=None): + def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": """ Override ndarray.argsort. Argsorts the value, omitting NA/null values, and places the result in the same locations as the non-NA values. @@ -3155,7 +3172,7 @@ def argsort(self, axis=0, kind="quicksort", order=None): np.argsort(values, kind=kind), index=self.index, dtype="int64" ).__finalize__(self) - def nlargest(self, n=5, keep="first"): + def nlargest(self, n=5, keep="first") -> "Series": """ Return the largest `n` elements. 
@@ -3253,7 +3270,7 @@ def nlargest(self, n=5, keep="first"): """ return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest() - def nsmallest(self, n=5, keep="first"): + def nsmallest(self, n=5, keep="first") -> "Series": """ Return the smallest `n` elements. @@ -3350,7 +3367,7 @@ def nsmallest(self, n=5, keep="first"): """ return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest() - def swaplevel(self, i=-2, j=-1, copy=True): + def swaplevel(self, i=-2, j=-1, copy=True) -> "Series": """ Swap levels i and j in a :class:`MultiIndex`. @@ -3373,7 +3390,7 @@ def swaplevel(self, i=-2, j=-1, copy=True): self ) - def reorder_levels(self, order): + def reorder_levels(self, order) -> "Series": """ Rearrange index levels using input order. @@ -3497,7 +3514,7 @@ def unstack(self, level=-1, fill_value=None): # ---------------------------------------------------------------------- # function application - def map(self, arg, na_action=None): + def map(self, arg, na_action=None) -> "Series": """ Map values of Series according to input correspondence. @@ -3575,7 +3592,7 @@ def map(self, arg, na_action=None): new_values = super()._map_values(arg, na_action=na_action) return self._constructor(new_values, index=self.index).__finalize__(self) - def _gotitem(self, key, ndim, subset=None): + def _gotitem(self, key, ndim, subset=None) -> "Series": """ Sub-classes to define. Return a sliced object. @@ -3983,7 +4000,7 @@ def drop( level=None, inplace=False, errors="raise", - ): + ) -> "Series": """ Return Series with specified index labels removed. @@ -4124,7 +4141,7 @@ def replace( ) @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) @@ -4183,7 +4200,7 @@ def memory_usage(self, index=True, deep=False): v += self.index.memory_usage(deep=deep) return v - def isin(self, values): + def isin(self, values) -> "Series": """ Check whether `values` are contained in Series. @@ -4239,7 +4256,7 @@ def isin(self, values): result = algorithms.isin(self, values) return self._constructor(result, index=self.index).__finalize__(self) - def between(self, left, right, inclusive=True): + def between(self, left, right, inclusive=True) -> "Series": """ Return boolean Series equivalent to left <= series <= right. @@ -4315,19 +4332,19 @@ def between(self, left, right, inclusive=True): return lmask & rmask @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): + def isna(self) -> "Series": return super().isna() @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) - def isnull(self): + def isnull(self) -> "Series": return super().isnull() @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): + def notna(self) -> "Series": return super().notna() @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) - def notnull(self): + def notnull(self) -> "Series": return super().notnull() def dropna(self, axis=0, inplace=False, how=None): @@ -4421,7 +4438,7 @@ def dropna(self, axis=0, inplace=False, how=None): # ---------------------------------------------------------------------- # Time series-oriented methods - def to_timestamp(self, freq=None, how="start", copy=True): + def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": """ Cast to DatetimeIndex of Timestamps, at *beginning* of period. 
@@ -4446,7 +4463,7 @@ def to_timestamp(self, freq=None, how="start", copy=True): new_index = self.index.to_timestamp(freq=freq, how=how) return self._constructor(new_values, index=new_index).__finalize__(self) - def to_period(self, freq=None, copy=True): + def to_period(self, freq=None, copy=True) -> "Series": """ Convert Series from DatetimeIndex to PeriodIndex with desired frequency (inferred from index if not passed). From 4e2546d89260fda592332a3988573c26edc7152c Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sun, 12 Jan 2020 19:04:18 +0200 Subject: [PATCH 016/158] STY: wrong placed space in strings (#30940) --- pandas/tests/frame/test_missing.py | 4 ++-- pandas/tests/frame/test_repr_info.py | 6 +++--- pandas/tests/frame/test_reshape.py | 4 ++-- pandas/tests/groupby/test_grouping.py | 5 +---- pandas/tests/plotting/test_datetimelike.py | 4 ++-- pandas/tests/plotting/test_misc.py | 8 ++++---- pandas/tests/reshape/merge/test_join.py | 4 ++-- pandas/tests/reshape/merge/test_merge.py | 15 +++++++-------- pandas/tests/reshape/test_concat.py | 8 ++++---- pandas/tests/test_strings.py | 8 ++++---- pandas/util/_validators.py | 2 +- 11 files changed, 32 insertions(+), 36 deletions(-) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 2e6759cb1a238..ae0516dd29a1f 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -670,8 +670,8 @@ def test_fillna_invalid_value(self, float_frame): float_frame.fillna((1, 2)) # frame with series msg = ( - '"value" parameter must be a scalar, dict or Series, but you' - ' passed a "DataFrame"' + '"value" parameter must be a scalar, dict or Series, but you ' + 'passed a "DataFrame"' ) with pytest.raises(TypeError, match=msg): float_frame.iloc[:, 0].fillna(float_frame) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 05bdec4a3a4d2..49e6fe4940e18 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -164,13 +164,13 @@ def test_repr_column_name_unicode_truncation_bug(self): "Id": [7117434], "StringCol": ( "Is it possible to modify drop plot code" - " so that the output graph is displayed " + "so that the output graph is displayed " "in iphone simulator, Is it possible to " "modify drop plot code so that the " "output graph is \xe2\x80\xa8displayed " "in iphone simulator.Now we are adding " - "the CSV file externally. I want to Call" - " the File through the code.." + "the CSV file externally. I want to Call " + "the File through the code.." 
), } ) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 56a0c8cf4f5bd..60b7611c8b9be 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -424,8 +424,8 @@ def test_stack_mixed_levels(self): # When mixed types are passed and the ints are not level # names, raise msg = ( - "level should contain all level names or all level numbers, not" - " a mixture of the two" + "level should contain all level names or all level numbers, not " + "a mixture of the two" ) with pytest.raises(ValueError, match=msg): df2.stack(level=["animal", 0]) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 70ba21d89d22f..e424913804c33 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -725,10 +725,7 @@ def test_get_group(self): g.get_group("foo") with pytest.raises(ValueError, match=msg): g.get_group(("foo")) - msg = ( - "must supply a same-length tuple to get_group with multiple" - " grouping keys" - ) + msg = "must supply a same-length tuple to get_group with multiple grouping keys" with pytest.raises(ValueError, match=msg): g.get_group(("foo", "bar", "baz")) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 8f855fd0c6cff..fb86b600d3d3c 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -121,8 +121,8 @@ def test_both_style_and_color(self): ts = tm.makeTimeSeries() msg = ( "Cannot pass 'style' string with a color symbol and 'color' " - "keyword argument. Please use one or the other or pass 'style'" - " without a color symbol" + "keyword argument. Please use one or the other or pass 'style' " + "without a color symbol" ) with pytest.raises(ValueError, match=msg): ts.plot(style="b-", color="#000099") diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index c8aa1f23ccf1f..228c84528e882 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -319,8 +319,8 @@ def test_subplot_titles(self, iris): # Case len(title) > len(df) msg = ( - "The length of `title` must equal the number of columns if" - " using `title` of type `list` and `subplots=True`" + "The length of `title` must equal the number of columns if " + "using `title` of type `list` and `subplots=True`" ) with pytest.raises(ValueError, match=msg): df.plot(subplots=True, title=title + ["kittens > puppies"]) @@ -331,8 +331,8 @@ def test_subplot_titles(self, iris): # Case subplots=False and title is of type list msg = ( - "Using `title` of type `list` is not supported unless" - " `subplots=True` is passed" + "Using `title` of type `list` is not supported unless " + "`subplots=True` is passed" ) with pytest.raises(ValueError, match=msg): df.plot(subplots=False, title=title) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index a660acb143433..7020d373caf82 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -212,8 +212,8 @@ def test_join_on(self): source_copy = source.copy() source_copy["A"] = 0 msg = ( - "You are trying to merge on float64 and object columns. If" - " you wish to proceed you should use pd.concat" + "You are trying to merge on float64 and object columns. 
If " + "you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): target.join(source_copy, on="A") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8e0c4766056d3..30c440035d48e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -201,8 +201,8 @@ def test_merge_misspecified(self): merge(self.left, self.right, right_index=True) msg = ( - 'Can only pass argument "on" OR "left_on" and "right_on", not' - " a combination of both" + 'Can only pass argument "on" OR "left_on" and "right_on", not ' + "a combination of both" ) with pytest.raises(pd.errors.MergeError, match=msg): merge(self.left, self.left, left_on="key", on="key") @@ -1013,10 +1013,9 @@ def test_indicator(self): df_badcolumn = DataFrame({"col1": [1, 2], i: [2, 2]}) msg = ( - "Cannot use `indicator=True` option when data contains a" - " column named {}|" - "Cannot use name of an existing column for indicator" - " column" + "Cannot use `indicator=True` option when data contains a " + "column named {}|" + "Cannot use name of an existing column for indicator column" ).format(i) with pytest.raises(ValueError, match=msg): merge(df1, df_badcolumn, on="col1", how="outer", indicator=True) @@ -1235,8 +1234,8 @@ def test_validation(self): ) msg = ( - "Merge keys are not unique in either left or right dataset;" - " not a one-to-one merge" + "Merge keys are not unique in either left or right dataset; " + "not a one-to-one merge" ) with pytest.raises(MergeError, match=msg): merge(left, right, on="a", validate="1:1") diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 990669f1ae13a..b3b2c5a05c6ad 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -198,8 +198,8 @@ def test_concatlike_same_dtypes(self): # cannot append non-index msg = ( - r"cannot concatenate object of type '.+';" - " only Series and DataFrame objs are valid" + r"cannot concatenate object of type '.+'; " + "only Series and DataFrame objs are valid" ) with pytest.raises(TypeError, match=msg): pd.Series(vals1).append(vals2) @@ -1866,8 +1866,8 @@ def test_concat_invalid(self): # trying to concat a ndframe with a non-ndframe df1 = tm.makeCustomDataframe(10, 2) msg = ( - "cannot concatenate object of type '{}';" - " only Series and DataFrame objs are valid" + "cannot concatenate object of type '{}'; " + "only Series and DataFrame objs are valid" ) for obj in [1, dict(), [1, 2], (1, 2)]: with pytest.raises(TypeError, match=msg.format(type(obj))): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 7f3375070d7d9..a92f917820bd0 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3392,8 +3392,8 @@ def test_encode_decode_errors(self): encodeBase = Series(["a", "b", "a\x9d"]) msg = ( - r"'charmap' codec can't encode character '\\x9d' in position 1:" - " character maps to " + r"'charmap' codec can't encode character '\\x9d' in position 1: " + "character maps to " ) with pytest.raises(UnicodeEncodeError, match=msg): encodeBase.str.encode("cp1252") @@ -3406,8 +3406,8 @@ def test_encode_decode_errors(self): decodeBase = Series([b"a", b"b", b"a\x9d"]) msg = ( - "'charmap' codec can't decode byte 0x9d in position 1:" - " character maps to " + "'charmap' codec can't decode byte 0x9d in position 1: " + "character maps to " ) with pytest.raises(UnicodeDecodeError, match=msg): decodeBase.str.decode("cp1252") diff --git 
a/pandas/util/_validators.py b/pandas/util/_validators.py index b69c974661f89..a715094e65e98 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -297,7 +297,7 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " "arguments to remove any ambiguity. In the future, using " "positional arguments for 'index' or 'columns' will raise " - " a 'TypeError'." + "a 'TypeError'." ) warnings.warn(msg.format(method_name=method_name), FutureWarning, stacklevel=4) out[data._AXIS_NAMES[0]] = args[0] From 45580a213c9e5e2c69fa39840c3583f6f5160bed Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 13 Jan 2020 01:31:16 -0700 Subject: [PATCH 017/158] DOC: Fix whatsnew contributors section (#30926) --- doc/source/whatsnew/v0.25.3.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.3.rst b/doc/source/whatsnew/v0.25.3.rst index f73a3f956f42e..f7f54198a0f82 100644 --- a/doc/source/whatsnew/v0.25.3.rst +++ b/doc/source/whatsnew/v0.25.3.rst @@ -19,4 +19,4 @@ Groupby/resample/rolling Contributors ~~~~~~~~~~~~ -.. contributors:: v0.25.2..HEAD +.. contributors:: v0.25.2..v0.25.3 diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5f79accc5c679..afbc113e98957 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1177,3 +1177,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v0.25.3..v1.0.0rc0 From 439d6298f9af1a6ddb207a6920d47d6e0eb1abe4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Jan 2020 01:48:37 -0800 Subject: [PATCH 018/158] CI: numpydev changed double to single quote (#30952) --- pandas/tests/dtypes/test_common.py | 3 ++- pandas/tests/frame/methods/test_to_records.py | 2 +- pandas/tests/indexes/interval/test_astype.py | 2 +- pandas/tests/indexes/interval/test_constructors.py | 2 +- pandas/tests/io/parser/test_dtypes.py | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ce925891f62c0..097e83d93ee71 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -668,7 +668,8 @@ def test__get_dtype(input_param, result): (None, "Cannot deduce dtype from null object"), (1, "data type not understood"), (1.2, "data type not understood"), - ("random string", 'data type "random string" not understood'), + # numpy dev changed from double-quotes to single quotes + ("random string", "data type [\"']random string[\"'] not understood"), (pd.DataFrame([1, 2]), "data type not understood"), ], ) diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 54a3affdc3024..d0181f0309af1 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -235,7 +235,7 @@ def test_to_records_with_categorical(self): # Check that bad types raise ( dict(index=False, column_dtypes={"A": "int32", "B": "foo"}), - (TypeError, 'data type "foo" not understood'), + (TypeError, "data type [\"']foo[\"'] not understood"), ), ], ) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 2b1742d58b77e..c94af6c0d533e 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -67,7 +67,7 @@ def test_astype_cannot_cast(self, index, dtype): index.astype(dtype) 
def test_astype_invalid_dtype(self, index): - msg = 'data type "fake_dtype" not understood' + msg = "data type [\"']fake_dtype[\"'] not understood" with pytest.raises(TypeError, match=msg): index.astype("fake_dtype") diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 13a45df743cf5..837c124db2bed 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -164,7 +164,7 @@ def test_generic_errors(self, constructor): constructor(dtype="int64", **filler) # invalid dtype - msg = 'data type "invalid" not understood' + msg = "data type [\"']invalid[\"'] not understood" with pytest.raises(TypeError, match=msg): constructor(dtype="invalid", **filler) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 2133f8116a95e..d08c86bf2ae75 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -79,7 +79,7 @@ def test_invalid_dtype_per_column(all_parsers): 3,4.5 4,5.5""" - with pytest.raises(TypeError, match='data type "foo" not understood'): + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) From 46c2864c34eee0cd94c8842353331e293b0f2004 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Jan 2020 03:00:11 -0800 Subject: [PATCH 019/158] CLN: leftover ix checks (#30951) --- pandas/core/generic.py | 2 +- pandas/core/indexes/base.py | 29 ++++++++++++++--------------- pandas/core/indexes/datetimelike.py | 6 +++--- pandas/core/indexes/datetimes.py | 4 ++-- pandas/core/indexes/numeric.py | 8 ++++---- pandas/core/indexes/period.py | 4 ++-- pandas/core/indexes/timedeltas.py | 4 ++-- 7 files changed, 28 insertions(+), 29 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 03e86758b64ed..04ce424edbee4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -177,7 +177,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() - _deprecations: FrozenSet[str] = frozenset(["get_values", "ix"]) + _deprecations: FrozenSet[str] = frozenset(["get_values"]) _metadata: List[str] = [] _is_copy = None _data: BlockManager diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ca929b188dc33..62e3fd28f6684 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2829,12 +2829,12 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): Parameters ---------- key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem', 'iloc'} or None """ @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] if kind == "iloc": return self._validate_indexer("positional", key, kind) @@ -2842,11 +2842,11 @@ def _convert_scalar_indexer(self, key, kind=None): if len(self) and not isinstance(self, ABCMultiIndex): # we can raise here if we are definitive that this - # is positional indexing (eg. .ix on with a float) + # is positional indexing (eg. 
.loc on with a float) # or label indexing if we are using a type able # to be represented in the index - if kind in ["getitem", "ix"] and is_float(key): + if kind in ["getitem"] and is_float(key): if not self.is_floating(): return self._invalid_indexer("label", key) @@ -2882,12 +2882,12 @@ def _convert_scalar_indexer(self, key, kind=None): Parameters ---------- key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem', 'iloc'} or None """ @Appender(_index_shared_docs["_convert_slice_indexer"]) def _convert_slice_indexer(self, key: slice, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # validate iloc if kind == "iloc": @@ -3026,7 +3026,7 @@ def _convert_index_indexer(self, keyarr): @Appender(_index_shared_docs["_convert_list_indexer"]) def _convert_list_indexer(self, keyarr, kind=None): if ( - kind in [None, "iloc", "ix"] + kind in [None, "iloc"] and is_integer_dtype(keyarr) and not self.is_floating() and not isinstance(keyarr, ABCPeriodIndex) @@ -4704,7 +4704,7 @@ def _validate_indexer(self, form, key, kind): If we are positional indexer, validate that we have appropriate typed bounds must be an integer. """ - assert kind in ["ix", "loc", "getitem", "iloc"] + assert kind in ["loc", "getitem", "iloc"] if key is None: pass @@ -4725,7 +4725,7 @@ def _validate_indexer(self, form, key, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- @@ -4738,15 +4738,14 @@ def _validate_indexer(self, form, key, kind): @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] # We are a plain index here (sub-class override this method if they # wish to have special treatment for floats/ints, e.g. Float64Index and # datetimelike Indexes # reject them if is_float(label): - if not (kind in ["ix"] and (self.holds_integer() or self.is_floating())): - self._invalid_indexer("slice", label) + self._invalid_indexer("slice", label) # we are trying to find integer bounds on a non-integer based index # this is rejected (generally .loc gets you here) @@ -4780,14 +4779,14 @@ def get_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- int Index of label. """ - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] if side not in ("left", "right"): raise ValueError( @@ -4847,7 +4846,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): If None, defaults to the end. step : int, defaults None If None, defaults to 1. 
- kind : {'ix', 'loc', 'getitem'} or None + kind : {'loc', 'getitem'} or None Returns ------- diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index c4dac9d1c4a11..9eb5ed7cb0911 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -388,10 +388,10 @@ def _convert_scalar_indexer(self, key, kind=None): Parameters ---------- key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem', 'iloc'} or None """ - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # we don't allow integer/float indexing for loc # we don't allow float indexing for ix/getitem @@ -400,7 +400,7 @@ def _convert_scalar_indexer(self, key, kind=None): is_flt = is_float(key) if kind in ["loc"] and (is_int or is_flt): self._invalid_indexer("index", key) - elif kind in ["ix", "getitem"] and is_flt: + elif kind in ["getitem"] and is_flt: self._invalid_indexer("index", key) return super()._convert_scalar_indexer(key, kind=kind) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 2241921e94694..75515949d1855 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -742,7 +742,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- @@ -752,7 +752,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): ----- Value of `side` parameter should be validated in caller. """ - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] if is_float(label) or isinstance(label, time) or is_integer(label): self._invalid_indexer("slice", label) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index b9b44284edaa9..9a3a021bd801a 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -99,7 +99,7 @@ def _validate_dtype(cls, dtype: Dtype) -> None: @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] # we will try to coerce to integers return self._maybe_cast_indexer(label) @@ -260,7 +260,7 @@ def asi8(self) -> np.ndarray: @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # don't coerce ilocs to integers if kind != "iloc": @@ -317,7 +317,7 @@ def asi8(self) -> np.ndarray: @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # don't coerce ilocs to integers if kind != "iloc": @@ -404,7 +404,7 @@ def astype(self, dtype, copy=True): @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] if kind == "iloc": return self._validate_indexer("positional", key, kind) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 6ab2e66e05d6e..4e3689078d535 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -625,7 +625,7 @@ def 
_maybe_cast_slice_bound(self, label, side, kind):
         ----------
         label : object
         side : {'left', 'right'}
-        kind : {'ix', 'loc', 'getitem'}
+        kind : {'loc', 'getitem'}
 
         Returns
         -------
@@ -636,7 +636,7 @@ def _maybe_cast_slice_bound(self, label, side, kind):
 
         Value of `side` parameter should be validated in caller.
         """
-        assert kind in ["ix", "loc", "getitem"]
+        assert kind in ["loc", "getitem"]
 
         if isinstance(label, datetime):
             return Period(label, freq=self.freq)
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
index 1f3182bc83e1d..582c257b50ad0 100644
--- a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -310,13 +310,13 @@ def _maybe_cast_slice_bound(self, label, side, kind):
         ----------
         label : object
         side : {'left', 'right'}
-        kind : {'ix', 'loc', 'getitem'}
+        kind : {'loc', 'getitem'} or None
 
         Returns
         -------
         label : object
         """
-        assert kind in ["ix", "loc", "getitem", None]
+        assert kind in ["loc", "getitem", None]
 
         if isinstance(label, str):
             parsed = Timedelta(label)

From bbccf2d8389eb661aad7655eba5c0c7413cc55cf Mon Sep 17 00:00:00 2001
From: Souvik Mandal
Date: Mon, 13 Jan 2020 18:35:17 +0530
Subject: [PATCH 020/158] DOC: Move import conventions from wiki to docs
 #30808 (#30888)

---
 doc/source/development/code_style.rst | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst
index 2fc2f1fb6ee8d..a295038b5a0bd 100644
--- a/doc/source/development/code_style.rst
+++ b/doc/source/development/code_style.rst
@@ -127,3 +127,29 @@ For example:
 
     value = str
     f"Unknown recived type, got: '{type(value).__name__}'"
+
+
+Imports (aim for absolute)
+==========================
+
+In Python 3, absolute imports are recommended. With an absolute import, a
+statement like ``import string`` imports the standard library's string module
+rather than a local ``string.py``. As much as possible, you should write out
+absolute imports that show the whole import chain from the top-level pandas.
+
+Explicit relative imports are also supported in Python 3, but their use is
+not recommended. Implicit relative imports should never be used, and they
+were removed in Python 3.
+
+For example:
+
+::
+
+    # preferred
+    import pandas.core.common as com
+
+    # not preferred
+    from .common import test_base
+
+    # wrong
+    from common import test_base

From 2c76d064835b3f616f858d94a45fae152ea2b510 Mon Sep 17 00:00:00 2001
From: Ryan Nazareth
Date: Mon, 13 Jan 2020 13:08:37 +0000
Subject: [PATCH 021/158] DOC: Move a couple of deprecations in whatsnew to
 the correct section (#30961)

---
 doc/source/whatsnew/v1.0.0.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index afbc113e98957..0879189a822f8 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -218,7 +218,6 @@ Other enhancements
   now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`).
 - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
 - :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`)
-- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
 - :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`)
 - DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`)
 - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`)
@@ -226,7 +225,6 @@ Other enhancements
 - :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`)
 - Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`)
 - :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`)
-- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`)
 - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`)
 - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`)
 - :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`)
@@ -707,6 +705,8 @@ Deprecations
 - ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`)
 - The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`)
 - Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`)
+- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
+- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`)
 
 **Selecting Columns from a Grouped DataFrame**
 

From 62d16abd3051cacecfe6307be074acaa00587560 Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Mon, 13 Jan 2020 16:49:32 +0200
Subject: [PATCH 022/158] STY: concat strings that should not be separated
 (#30942)

---
 pandas/_libs/algos.pyx               |  9 ++--
 pandas/_libs/groupby.pyx             |  3 +-
 pandas/_libs/hashing.pyx             | 11 +++--
 pandas/_libs/indexing.pyx            |  5 +-
 pandas/_libs/sparse.pyx              |  6 +--
 pandas/_libs/testing.pyx             |  6 +--
 pandas/_libs/tslibs/timestamps.pyx   | 74 ++++++++++++++++++----------
 pandas/_libs/window/aggregations.pyx |  3 +-
 8 files changed, 68 insertions(+), 49 deletions(-)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 7a2fc9dc7845a..dd1f38ce3a842 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -914,8 +914,7 @@ def rank_1d(rank_t[:] in_arr, ties_method='average',
                     ranks[argsorted[j]] = i + 1
             elif tiebreak == TIEBREAK_FIRST:
                 if rank_t is object:
-                    raise ValueError('first not supported for '
-                                     'non-numeric data')
+                    raise ValueError('first not supported for non-numeric data')
                 else:
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = j + 1
@@ -971,8 +970,7 @@ def rank_1d(rank_t[:] in_arr, ties_method='average',
                     ranks[argsorted[j]] = i + 1
             elif tiebreak == TIEBREAK_FIRST:
                 if rank_t is object:
-                    raise ValueError('first not supported for '
-                                     'non-numeric data')
+                    raise ValueError('first not supported for non-numeric data')
                 else:
                     for j in range(i - dups + 1, i + 1):
                         ranks[argsorted[j]] = j + 1
@@ -1137,8 +1135,7 @@ def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average',
                         ranks[i, argsorted[i, z]] = j + 1
                 elif tiebreak == TIEBREAK_FIRST:
                     if rank_t is object:
-                        raise ValueError('first not supported 
' - 'for non-numeric data') + raise ValueError('first not supported for non-numeric data') else: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = z + 1 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index abb8a6d388d26..93ea94f7b18fc 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -686,8 +686,7 @@ def _group_ohlc(floating[:, :] out, raise ValueError('Output array must have 4 columns') if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") + raise NotImplementedError("Argument 'values' must have only one dimension") out[:] = np.nan with nogil: diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 5298d8c5ed34e..878da670b2f68 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -51,8 +51,9 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): k = key.encode(encoding) kb = k if len(k) != 16: - raise ValueError("key should be a 16-byte string encoded, " - f"got {k} (len {len(k)})") + raise ValueError( + f"key should be a 16-byte string encoded, got {k} (len {len(k)})" + ) n = len(arr) @@ -77,8 +78,10 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): hash(val) data = str(val).encode(encoding) else: - raise TypeError(f"{val} of type {type(val)} is not a valid type " - "for hashing, must be string or null") + raise TypeError( + f"{val} of type {type(val)} is not a valid type for hashing, " + "must be string or null" + ) l = len(data) lens[i] = l diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index 01f4fb060d982..cdccdb504571c 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -18,6 +18,7 @@ cdef class _NDFrameIndexerBase: if ndim is None: ndim = self._ndim = self.obj.ndim if ndim > 2: - raise ValueError("NDFrameIndexer does not support " - "NDFrame objects with ndim > 2") + raise ValueError( + "NDFrameIndexer does not support NDFrame objects with ndim > 2" + ) return ndim diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index ee83901040b36..3a6dd506b2428 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -72,9 +72,9 @@ cdef class IntIndex(SparseIndex): """ if self.npoints > self.length: - msg = (f"Too many indices. Expected " - f"{self.length} but found {self.npoints}") - raise ValueError(msg) + raise ValueError( + f"Too many indices. Expected {self.length} but found {self.npoints}" + ) # Indices are vacuously ordered and non-negative # if the sequence of indices is empty. 
diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 5a30b71a6fea1..0e57b563d4d25 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -127,9 +127,9 @@ cpdef assert_almost_equal(a, b, # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - assert has_length(a) and has_length(b), ("Can't compare objects without " - "length, one or both is invalid: " - f"({a}, {b})") + assert has_length(a) and has_length(b), ( + f"Can't compare objects without length, one or both is invalid: ({a}, {b})" + ) if a_is_ndarray and b_is_ndarray: na, nb = a.size, b.size diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index abe7f9e5b4105..36566b55e74ad 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -161,8 +161,7 @@ def round_nsint64(values, mode, freq): # if/elif above should catch all rounding modes defined in enum 'RoundTo': # if flow of control arrives here, it is a bug - raise ValueError("round_nsint64 called with an unrecognized " - "rounding mode") + raise ValueError("round_nsint64 called with an unrecognized rounding mode") # ---------------------------------------------------------------------- @@ -324,8 +323,10 @@ class Timestamp(_Timestamp): Function is not implemented. Use pd.to_datetime(). """ - raise NotImplementedError("Timestamp.strptime() is not implemented." - "Use to_datetime() to parse date strings.") + raise NotImplementedError( + "Timestamp.strptime() is not implemented. " + "Use to_datetime() to parse date strings." + ) @classmethod def combine(cls, date, time): @@ -381,8 +382,9 @@ class Timestamp(_Timestamp): if tzinfo is not None: if not PyTZInfo_Check(tzinfo): # tzinfo must be a datetime.tzinfo object, GH#17690 - raise TypeError(f'tzinfo must be a datetime.tzinfo object, ' - f'not {type(tzinfo)}') + raise TypeError( + f"tzinfo must be a datetime.tzinfo object, not {type(tzinfo)}" + ) elif tz is not None: raise ValueError('Can provide at most one of tz, tzinfo') @@ -393,8 +395,10 @@ class Timestamp(_Timestamp): # User passed a date string to parse. # Check that the user didn't also pass a date attribute kwarg. if any(arg is not None for arg in _date_attributes): - raise ValueError('Cannot pass a date attribute keyword ' - 'argument when passing a date string') + raise ValueError( + "Cannot pass a date attribute keyword " + "argument when passing a date string" + ) elif ts_input is _no_input: # User passed keyword arguments. @@ -578,8 +582,10 @@ timedelta}, default 'raise' @tz.setter def tz(self, value): # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError("Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate") + raise AttributeError( + "Cannot directly set timezone. 
" + "Use tz_localize() or tz_convert() as appropriate" + ) def __setstate__(self, state): self.value = state[0] @@ -598,9 +604,10 @@ timedelta}, default 'raise' if self.tz is not None: # GH#21333 - warnings.warn("Converting to Period representation will " - "drop timezone information.", - UserWarning) + warnings.warn( + "Converting to Period representation will drop timezone information.", + UserWarning, + ) if freq is None: freq = self.freq @@ -810,13 +817,13 @@ default 'raise' if ambiguous == 'infer': raise ValueError('Cannot infer offset with only one time.') - nonexistent_options = ('raise', 'NaT', 'shift_forward', - 'shift_backward') + nonexistent_options = ('raise', 'NaT', 'shift_forward', 'shift_backward') if nonexistent not in nonexistent_options and not isinstance( nonexistent, timedelta): - raise ValueError("The nonexistent argument must be one of 'raise', " - "'NaT', 'shift_forward', 'shift_backward' or " - "a timedelta object") + raise ValueError( + "The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or a timedelta object" + ) if self.tzinfo is None: # tz naive, localize @@ -833,8 +840,9 @@ default 'raise' value = tz_convert_single(self.value, UTC, self.tz) return Timestamp(value, tz=tz, freq=self.freq) else: - raise TypeError('Cannot localize tz-aware Timestamp, use ' - 'tz_convert for conversions') + raise TypeError( + "Cannot localize tz-aware Timestamp, use tz_convert for conversions" + ) def tz_convert(self, tz): """ @@ -857,17 +865,28 @@ default 'raise' """ if self.tzinfo is None: # tz naive, use tz_localize - raise TypeError('Cannot convert tz-naive Timestamp, use ' - 'tz_localize to localize') + raise TypeError( + "Cannot convert tz-naive Timestamp, use tz_localize to localize" + ) else: # Same UTC timestamp, different time zone return Timestamp(self.value, tz=tz, freq=self.freq) astimezone = tz_convert - def replace(self, year=None, month=None, day=None, - hour=None, minute=None, second=None, microsecond=None, - nanosecond=None, tzinfo=object, fold=0): + def replace( + self, + year=None, + month=None, + day=None, + hour=None, + minute=None, + second=None, + microsecond=None, + nanosecond=None, + tzinfo=object, + fold=0, + ): """ implements datetime.replace, handles nanoseconds. 
@@ -910,8 +929,9 @@ default 'raise'
         def validate(k, v):
             """ validate integers """
             if not is_integer_object(v):
-                raise ValueError(f"value must be an integer, received "
-                                 f"{type(v)} for {k}")
+                raise ValueError(
+                    f"value must be an integer, received {type(v)} for {k}"
+                )
             return v
 
         if year is not None:
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index 0348843abc129..fe74d701ef00f 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -1871,8 +1871,7 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y,
         bint is_observation
 
     if len(input_y) != N:
-        raise ValueError(f"arrays are of different lengths "
-                         f"({N} and {len(input_y)})")
+        raise ValueError(f"arrays are of different lengths ({N} and {len(input_y)})")
 
     output = np.empty(N, dtype=float)
     if N == 0:

From dd6e31aa41056cfb4724eafeefa36f9587e8a763 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 13 Jan 2020 09:22:03 -0600
Subject: [PATCH 023/158] REGR: Fixed hash_key=None for object values (#30900)

* REGR: Fixed hash_key=None for object values

Closes https://github.com/pandas-dev/pandas/issues/30887
---
 pandas/core/util/hashing.py       | 6 +++++-
 pandas/tests/util/test_hashing.py | 7 +++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 43655fa3ea913..3366f10b92604 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -2,6 +2,7 @@
 data hash pandas / numpy objects
 """
 import itertools
+from typing import Optional
 
 import numpy as np
 
@@ -58,7 +59,7 @@ def hash_pandas_object(
     obj,
     index: bool = True,
     encoding: str = "utf8",
-    hash_key: str = _default_hash_key,
+    hash_key: Optional[str] = _default_hash_key,
     categorize: bool = True,
 ):
     """
@@ -82,6 +83,9 @@
     """
     from pandas import Series
 
+    if hash_key is None:
+        hash_key = _default_hash_key
+
     if isinstance(obj, ABCMultiIndex):
         return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)
 
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
index c915edad4bb8e..c856585f20138 100644
--- a/pandas/tests/util/test_hashing.py
+++ b/pandas/tests/util/test_hashing.py
@@ -374,3 +374,10 @@ def test_hash_with_tuple():
     df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]})
     with pytest.raises(TypeError, match="unhashable type: 'list'"):
         hash_pandas_object(df3)
+
+
+def test_hash_object_none_key():
+    # https://github.com/pandas-dev/pandas/issues/30887
+    result = pd.util.hash_pandas_object(pd.Series(["a", "b"]), hash_key=None)
+    expected = pd.Series([4578374827886788867, 17338122309987883691], dtype="uint64")
+    tm.assert_series_equal(result, expected)

From bd63eceb80333b7160c2be08163e718804930d57 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 13 Jan 2020 09:22:28 -0800
Subject: [PATCH 024/158] CLN: remove no-op from indexing (#30934)

---
 pandas/core/indexing.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index ea59a6a49e649..cc11879142ffe 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1340,9 +1340,6 @@ def _multi_take(self, tup: Tuple):
         }
         return o._reindex_with_indexers(d, copy=True, allow_dups=True)
 
-    def _convert_for_reindex(self, key, axis: int):
-        return key
-
     def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):
         # we have an axis0 multi-index, handle or raise
         axis = self.axis or 0
@@ -1539,10 +1536,6 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False):
             return ax[indexer], indexer
 
         if ax.is_unique and not getattr(ax, "is_overlapping", False):
-            # If we are trying to get actual keys from empty Series, we
-            # patiently wait for a KeyError later on - otherwise, convert
-            if len(ax) or not len(key):
-                key = self._convert_for_reindex(key, axis)
             indexer = ax.get_indexer_for(key)
             keyarr = ax.reindex(keyarr)[0]
         else:
@@ -1757,6 +1750,7 @@ def __getitem__(self, key):
             try:
                 return self._getitem_scalar(key)
             except (KeyError, IndexError, AttributeError):
+                # AttributeError for IntervalTree get_value
                 pass
             return self._getitem_tuple(key)
         else:

From 7ba53f0e0e61b0e542bf553707a1df7cf8cfa83e Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Mon, 13 Jan 2020 19:03:25 +0000
Subject: [PATCH 025/158] BUG: -1 to the power of pd.NA was returning -1 (#30960)

---
 doc/source/user_guide/missing_data.rst |  1 -
 pandas/_libs/missing.pyx               |  4 ++--
 pandas/tests/arrays/test_integer.py    | 12 ++++++-----
 pandas/tests/scalar/test_na_scalar.py  | 29 ++++++++++++++------------
 4 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index abbb6feef6056..df9949e8ac261 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -831,7 +831,6 @@ Operation          Result
 ================ ======
 ``pd.NA ** 0``   0
 ``1 ** pd.NA``   1
-``-1 ** pd.NA``  -1
 ================ ======
 
 In equality and comparison operations, ``pd.NA`` also propagates. This deviates
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index 26653438356b1..4d17a6f883c1c 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -417,12 +417,12 @@ class NAType(C_NAType):
         if other is C_NA:
             return NA
         elif isinstance(other, (numbers.Number, np.bool_)):
-            if other == 1 or other == -1:
+            if other == 1:
                 return other
             else:
                 return NA
        elif isinstance(other, np.ndarray):
-            return np.where((other == 1) | (other == -1), other, NA)
+            return np.where(other == 1, other, NA)
 
         return NotImplemented
 
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
index 0c8980c43c370..f1a7cc741603d 100644
--- a/pandas/tests/arrays/test_integer.py
+++ b/pandas/tests/arrays/test_integer.py
@@ -363,24 +363,26 @@ def test_divide_by_zero(self, zero, negative):
         tm.assert_numpy_array_equal(result, expected)
 
     def test_pow_scalar(self):
-        a = pd.array([0, 1, None, 2], dtype="Int64")
+        a = pd.array([-1, 0, 1, None, 2], dtype="Int64")
         result = a ** 0
-        expected = pd.array([1, 1, 1, 1], dtype="Int64")
+        expected = pd.array([1, 1, 1, 1, 1], dtype="Int64")
         tm.assert_extension_array_equal(result, expected)
 
         result = a ** 1
-        expected = pd.array([0, 1, None, 2], dtype="Int64")
+        expected = pd.array([-1, 0, 1, None, 2], dtype="Int64")
         tm.assert_extension_array_equal(result, expected)
 
         result = a ** pd.NA
-        expected = pd.array([None, 1, None, None], dtype="Int64")
+        expected = pd.array([None, None, 1, None, None], dtype="Int64")
         tm.assert_extension_array_equal(result, expected)
 
         result = a ** np.nan
-        expected = np.array([np.nan, 1, np.nan, np.nan], dtype="float64")
+        expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64")
         tm.assert_numpy_array_equal(result, expected)
 
         # reversed
+        a = a[1:]  # Can't raise integers to negative powers.
+ result = 0 ** a expected = pd.array([1, 0, None, 0], dtype="Int64") tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 7d05511239ebc..dcb9d66708724 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -96,19 +96,7 @@ def test_pow_special(value, asarray): @pytest.mark.parametrize( - "value", - [ - 1, - 1.0, - -1, - -1.0, - True, - np.bool_(True), - np.int_(1), - np.float_(1), - np.int_(-1), - np.float_(-1), - ], + "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)], ) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_special(value, asarray): @@ -125,6 +113,21 @@ def test_rpow_special(value, asarray): assert result == value +@pytest.mark.parametrize( + "value", [-1, -1.0, np.int_(-1), np.float_(-1)], +) +@pytest.mark.parametrize("asarray", [True, False]) +def test_rpow_minus_one(value, asarray): + if asarray: + value = np.array([value]) + result = value ** pd.NA + + if asarray: + result = result[0] + + assert pd.isna(result) + + def test_unary_ops(): assert +NA is NA assert -NA is NA From 67fcdefbd42921faa116558d5cf93635dd6fb1fc Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 13 Jan 2020 19:28:23 +0000 Subject: [PATCH 026/158] TYP: NDFrame.resample (#30947) --- pandas/core/frame.py | 7 ++++--- pandas/core/generic.py | 10 +++++++--- pandas/core/resample.py | 4 ++-- pandas/core/series.py | 7 ++++--- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 676b78573399c..594b8a00a8672 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -102,7 +102,6 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.groupby import generic as groupby_generic from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes.datetimes import DatetimeIndex @@ -129,6 +128,7 @@ import pandas.plotting if TYPE_CHECKING: + from pandas.core.groupby.generic import DataFrameGroupBy from pandas.io.formats.style import Styler # --------------------------------------------------------------------- @@ -5777,13 +5777,14 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, - ) -> "groupby_generic.DataFrameGroupBy": + ) -> "DataFrameGroupBy": + from pandas.core.groupby.generic import DataFrameGroupBy if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return groupby_generic.DataFrameGroupBy( + return DataFrameGroupBy( obj=self, keys=by, axis=axis, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 04ce424edbee4..05066ac0ec128 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8,6 +8,7 @@ import re from textwrap import dedent from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -101,6 +102,9 @@ from pandas.io.formats.printing import pprint_thing from pandas.tseries.frequencies import to_offset +if TYPE_CHECKING: + from pandas.core.resample import Resampler + # goal is to be able to define the docs close to function, while still being # able to share _shared_docs: Dict[str, str] = dict() @@ -7685,7 +7689,7 @@ def resample( base: int = 0, on=None, level=None, - ): + ) -> 
"Resampler": """ Resample time-series data. @@ -7950,10 +7954,10 @@ def resample( 2000-01-04 36 90 """ - from pandas.core.resample import resample + from pandas.core.resample import get_resampler axis = self._get_axis_number(axis) - return resample( + return get_resampler( self, freq=rule, label=label, diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0e43880dfda07..fb837409a00f5 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1262,7 +1262,7 @@ def _constructor(self): return TimedeltaIndexResampler -def resample(obj, kind=None, **kwds): +def get_resampler(obj, kind=None, **kwds): """ Create a TimeGrouper and return our resampler. """ @@ -1270,7 +1270,7 @@ def resample(obj, kind=None, **kwds): return tg._get_resampler(obj, kind=kind) -resample.__doc__ = Resampler.__doc__ +get_resampler.__doc__ = Resampler.__doc__ def get_resampler_for_grouping( diff --git a/pandas/core/series.py b/pandas/core/series.py index ed338700f1011..fe5c5fd5e2bc8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -70,7 +70,6 @@ is_empty_data, sanitize_array, ) -from pandas.core.groupby import generic as groupby_generic from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.accessors import CombinedDatetimelikeProperties from pandas.core.indexes.api import ( @@ -94,6 +93,7 @@ if TYPE_CHECKING: from pandas.core.frame import DataFrame + from pandas.core.groupby.generic import SeriesGroupBy __all__ = ["Series"] @@ -1634,13 +1634,14 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, - ) -> "groupby_generic.SeriesGroupBy": + ) -> "SeriesGroupBy": + from pandas.core.groupby.generic import SeriesGroupBy if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return groupby_generic.SeriesGroupBy( + return SeriesGroupBy( obj=self, keys=by, axis=axis, From 993fdbebbcf96e31d6de7cf40b297a50ddfcaf7c Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 13 Jan 2020 11:50:08 -0800 Subject: [PATCH 027/158] DOC: whatsnew for 1.1 (#30972) --- doc/source/index.rst.template | 4 +- doc/source/whatsnew/index.rst | 8 ++ doc/source/whatsnew/v1.1.0.rst | 168 +++++++++++++++++++++++++++++++++ 3 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 doc/source/whatsnew/v1.1.0.rst diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 10705787dfedf..4ced92cbda81a 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -39,7 +39,7 @@ See the :ref:`overview` for more detail about what's in the library. :hidden: {% endif %} {% if not single_doc %} - What's New in 1.0.0 + What's New in 1.1.0 getting_started/index user_guide/index {% endif -%} @@ -51,7 +51,7 @@ See the :ref:`overview` for more detail about what's in the library. whatsnew/index {% endif %} -* :doc:`whatsnew/v1.0.0` +* :doc:`whatsnew/v1.1.0` * :doc:`getting_started/index` * :doc:`getting_started/install` diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 05c7f72882088..bc463d0ab22d8 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details, see the commit logs at http://github.com/pandas-dev/pandas. For install and upgrade instructions, see :ref:`install`. +Version 1.1 +----------- + +.. 
toctree:: + :maxdepth: 2 + + v1.1.0 + Version 1.0 ----------- diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst new file mode 100644 index 0000000000000..c3ee72f6442fc --- /dev/null +++ b/doc/source/whatsnew/v1.1.0.rst @@ -0,0 +1,168 @@ +.. _whatsnew_110: + +What's new in 1.1.0 (??) +------------------------ + +These are the changes in pandas 1.1.0. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_110.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- +- + + +.. --------------------------------------------------------------------------- + +.. _whatsnew_110.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + + +.. _whatsnew_110.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_110.bug_fixes: + +Bug fixes +~~~~~~~~~ + + +Categorical +^^^^^^^^^^^ + +- +- + +Datetimelike +^^^^^^^^^^^^ +- +- + +Timedelta +^^^^^^^^^ + +- +- + +Timezones +^^^^^^^^^ + +- +- + + +Numeric +^^^^^^^ +- +- + +Conversion +^^^^^^^^^^ + +- +- + +Strings +^^^^^^^ + +- +- + + +Interval +^^^^^^^^ + +- +- + +Indexing +^^^^^^^^ + +- +- + +Missing +^^^^^^^ + +- +- + +MultiIndex +^^^^^^^^^^ + +- +- + +I/O +^^^ + +- +- + +Plotting +^^^^^^^^ + +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- +- + + +Reshaping +^^^^^^^^^ + +- +- + +Sparse +^^^^^^ + +- +- + +ExtensionArray +^^^^^^^^^^^^^^ + +- +- + + +Other +^^^^^ +- +- + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_110.contributors: + +Contributors +~~~~~~~~~~~~ From 69283277ecf220cb9715d2460b3e630e31e0e686 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Jan 2020 12:25:58 -0800 Subject: [PATCH 028/158] CLN: misc cleanups (#30877) --- pandas/_libs/index.pyx | 2 -- pandas/core/indexes/datetimelike.py | 14 ++++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 28d269a9a809e..ce6d12d61c521 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -85,7 +85,6 @@ cdef class IndexEngine: """ cdef: object loc - void* data_ptr loc = self.get_loc(key) if isinstance(loc, slice) or util.is_array(loc): @@ -101,7 +100,6 @@ cdef class IndexEngine: """ cdef: object loc - void* data_ptr loc = self.get_loc(key) value = convert_scalar(arr, value) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9eb5ed7cb0911..bf1272b223f70 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -156,13 +156,11 @@ def equals(self, other) -> bool: def __contains__(self, key): try: res = self.get_loc(key) - return ( - is_scalar(res) - or isinstance(res, slice) - or (is_list_like(res) and len(res)) - ) except (KeyError, TypeError, ValueError): return False + return bool( + is_scalar(res) or isinstance(res, slice) or (is_list_like(res) and len(res)) + ) # Try to run function on index first, and then on elements of index # Especially important for group-by functionality @@ -875,11 +873,7 @@ def _is_convertible_to_index_for_join(cls, other: Index) -> bool: def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) - if ( - isinstance(other, type(self)) - and self.freq == other.freq - and self._can_fast_union(other) - ): + if self._can_fast_union(other): joined = self._shallow_copy(joined) joined.name = name return joined From 2bf0c9fde7dacde096729cd241902f8571f9c024 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 14:45:42 -0600 Subject: [PATCH 029/158] Compat for util.testing import (#30973) * Compat for util.testing import Closes #30869 --- pandas/tests/api/test_api.py | 18 ++++++++++++++++++ pandas/util/__init__.py | 27 +++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 8b897524cb053..406d5f055797d 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -1,6 +1,9 @@ +import subprocess import sys from typing import List +import pytest + import pandas as pd from pandas import api, compat import pandas._testing as tm @@ -311,3 +314,18 @@ def test_util_testing_deprecated_direct(self): assert "pandas.util.testing is deprecated" in str(m[0].message) assert "pandas.testing instead" in str(m[0].message) + + def test_util_in_top_level(self): + # in a subprocess to avoid import caching issues + out = subprocess.check_output( + [ + sys.executable, + "-c", + "import pandas; pandas.util.testing.assert_series_equal", + ], + stderr=subprocess.STDOUT, + ).decode() + assert "pandas.util.testing is deprecated" in out + + with pytest.raises(AttributeError, match="foo"): + pd.util.foo diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index d906c0371d207..b5271dbc0443e 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -1,3 +1,30 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa +from pandas import compat from pandas.core.util.hashing import hash_array, 
hash_pandas_object # noqa + +# compatibility for import pandas; pandas.util.testing + +if compat.PY37: + + def __getattr__(name): + if name == "testing": + import pandas.util.testing + + return pandas.util.testing + else: + raise AttributeError(f"module 'pandas.util' has no attribute '{name}'") + + +else: + + class _testing: + def __getattr__(self, item): + import pandas.util.testing + + return getattr(pandas.util.testing, item) + + testing = _testing() + + +del compat From 307137ce95869fe636d99d38059827e8c063d430 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Tue, 14 Jan 2020 00:01:38 +0200 Subject: [PATCH 030/158] STY: concat strings (#30979) --- pandas/__init__.py | 9 +++---- pandas/_config/config.py | 3 +-- pandas/core/ops/__init__.py | 3 +-- pandas/core/reshape/melt.py | 6 ++--- pandas/core/reshape/merge.py | 34 ++++++++----------------- pandas/core/reshape/tile.py | 3 +-- pandas/core/tools/datetimes.py | 7 ++--- pandas/core/window/common.py | 3 +-- pandas/core/window/rolling.py | 11 +++----- pandas/io/excel/_util.py | 3 +-- pandas/io/formats/format.py | 8 ++---- pandas/io/formats/html.py | 4 +-- pandas/io/formats/latex.py | 3 +-- pandas/io/sas/sas.pyx | 5 ++-- pandas/io/sas/sas7bdat.py | 6 ++--- pandas/io/sas/sasreader.py | 3 +-- scripts/generate_pip_deps_from_conda.py | 3 +-- 17 files changed, 39 insertions(+), 75 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 491bcb21f245d..d526531b159b2 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -35,8 +35,7 @@ raise ImportError( f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " - "'python setup.py build_ext --inplace --force' to build " - "the C extensions first." + "'python setup.py build_ext --inplace --force' to build the C extensions first." ) from pandas._config import ( @@ -198,8 +197,7 @@ def __getattr__(name): warnings.warn( "The Panel class is removed from pandas. Accessing it " - "from the top-level namespace will also be removed in " - "the next version", + "from the top-level namespace will also be removed in the next version", FutureWarning, stacklevel=2, ) @@ -238,8 +236,7 @@ class Panel: elif name in {"SparseSeries", "SparseDataFrame"}: warnings.warn( f"The {name} class is removed from pandas. 
Accessing it from " - "the top-level namespace will also be removed in the next " - "version", + "the top-level namespace will also be removed in the next version", FutureWarning, stacklevel=2, ) diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 0a3009f74492f..42df8a84a8c77 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -165,8 +165,7 @@ def _reset_option(pat, silent=False): raise ValueError( "You must specify at least 4 characters when " "resetting multiple keys, use the special keyword " - '"all" to reset all the options to their default ' - "value" + '"all" to reset all the options to their default value' ) for k in keys: diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f51d71d5507a0..1355060efd097 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -664,8 +664,7 @@ def to_series(right): elif right.ndim > 2: raise ValueError( - "Unable to coerce to Series/DataFrame, dim " - f"must be <= 2: {right.shape}" + f"Unable to coerce to Series/DataFrame, dim must be <= 2: {right.shape}" ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index d4ccb19fc0dda..d04287e1e9088 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -52,8 +52,7 @@ def melt( if not missing.empty: raise KeyError( "The following 'id_vars' are not present " - "in the DataFrame: {missing}" - "".format(missing=list(missing)) + f"in the DataFrame: {list(missing)}" ) else: id_vars = [] @@ -74,8 +73,7 @@ def melt( if not missing.empty: raise KeyError( "The following 'value_vars' are not present in " - "the DataFrame: {missing}" - "".format(missing=list(missing)) + f"the DataFrame: {list(missing)}" ) frame = frame.loc[:, id_vars + value_vars] else: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5f92e4a88b568..acb53ff6ca555 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -600,13 +600,11 @@ def __init__( if not is_bool(left_index): raise ValueError( - "left_index parameter must be of type bool, not " - "{left_index}".format(left_index=type(left_index)) + f"left_index parameter must be of type bool, not {type(left_index)}" ) if not is_bool(right_index): raise ValueError( - "right_index parameter must be of type bool, not " - "{right_index}".format(right_index=type(right_index)) + f"right_index parameter must be of type bool, not {type(right_index)}" ) # warn user when merging between different levels @@ -1092,8 +1090,7 @@ def _maybe_coerce_merge_keys(self): warnings.warn( "You are merging on int and float " "columns where the float values " - "are not equal to their int " - "representation", + "are not equal to their int representation", UserWarning, ) continue @@ -1103,8 +1100,7 @@ def _maybe_coerce_merge_keys(self): warnings.warn( "You are merging on int and float " "columns where the float values " - "are not equal to their int " - "representation", + "are not equal to their int representation", UserWarning, ) continue @@ -1251,20 +1247,17 @@ def _validate(self, validate: str): ) elif not left_unique: raise MergeError( - "Merge keys are not unique in left dataset; " - "not a one-to-one merge" + "Merge keys are not unique in left dataset; not a one-to-one merge" ) elif not right_unique: raise MergeError( - "Merge keys are not unique in right dataset; " - "not a one-to-one merge" + "Merge keys are not unique in right dataset; not a one-to-one merge" 
) elif validate in ["one_to_many", "1:m"]: if not left_unique: raise MergeError( - "Merge keys are not unique in left dataset; " - "not a one-to-many merge" + "Merge keys are not unique in left dataset; not a one-to-many merge" ) elif validate in ["many_to_one", "m:1"]: @@ -1833,8 +1826,7 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = raise AssertionError( "If more than one join key is given then " "'right_ax' must be a MultiIndex and the " - "number of join keys must be the number of " - "levels in right_ax" + "number of join keys must be the number of levels in right_ax" ) left_indexer, right_indexer = _get_multiindex_indexer( @@ -2004,8 +1996,7 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame": return obj.to_frame() else: raise TypeError( - "Can only merge Series or DataFrame objects, " - "a {obj} was passed".format(obj=type(obj)) + f"Can only merge Series or DataFrame objects, a {type(obj)} was passed" ) @@ -2021,10 +2012,7 @@ def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): return left, right if not lsuffix and not rsuffix: - raise ValueError( - "columns overlap but no suffix specified: " - "{rename}".format(rename=to_rename) - ) + raise ValueError(f"columns overlap but no suffix specified: {to_rename}") def renamer(x, suffix): """ @@ -2043,7 +2031,7 @@ def renamer(x, suffix): x : renamed column name """ if x in to_rename and suffix is not None: - return "{x}{suffix}".format(x=x, suffix=suffix) + return f"{x}{suffix}" return x lrenamer = partial(renamer, suffix=lsuffix) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 2e3eb9170b15c..5a444d908b786 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -363,8 +363,7 @@ def _bins_to_cuts( if duplicates not in ["raise", "drop"]: raise ValueError( - "invalid value for 'duplicates' parameter, " - "valid options are: raise, drop" + "invalid value for 'duplicates' parameter, valid options are: raise, drop" ) if isinstance(bins, IntervalIndex): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index cfa42d764ee44..898fbc6f8bc3b 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -231,9 +231,7 @@ def _return_parsed_timezone_results(result, timezones, tz, name): """ if tz is not None: raise ValueError( - "Cannot pass a tz argument when " - "parsing strings with timezone " - "information." + "Cannot pass a tz argument when parsing strings with timezone information." 
) tz_results = np.array( [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)] @@ -817,8 +815,7 @@ def f(value): required = ",".join(req) raise ValueError( "to assemble mappings requires at least that " - f"[year, month, day] be specified: [{required}] " - "is missing" + f"[year, month, day] be specified: [{required}] is missing" ) # keys we don't recognize diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 64ec0e68e11b0..ed0b816f64800 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -98,8 +98,7 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False): and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) ): raise TypeError( - "arguments to moment function must be of type " - "np.ndarray/Series/DataFrame" + "arguments to moment function must be of type np.ndarray/Series/DataFrame" ) if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f612826132fd7..bdc94c7402eb5 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1820,8 +1820,7 @@ def _on(self) -> Index: else: raise ValueError( f"invalid on specified as {self.on}, " - "must be a column (of DataFrame), an Index " - "or None" + "must be a column (of DataFrame), an Index or None" ) def validate(self): @@ -1838,9 +1837,8 @@ def validate(self): # we don't allow center if self.center: raise NotImplementedError( - "center is not implemented " - "for datetimelike and offset " - "based windows" + "center is not implemented for " + "datetimelike and offset based windows" ) # this will raise ValueError on non-fixed freqs @@ -1886,8 +1884,7 @@ def _validate_freq(self): except (TypeError, ValueError): raise ValueError( f"passed window {self.window} is not " - "compatible with a datetimelike " - "index" + "compatible with a datetimelike index" ) _agg_see_also_doc = dedent( diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index a084be54dfa10..9d284c8031840 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -136,8 +136,7 @@ def _maybe_convert_usecols(usecols): if is_integer(usecols): raise ValueError( "Passing an integer for `usecols` is no longer supported. " - "Please pass in a list of int from 0 to `usecols` " - "inclusive instead." + "Please pass in a list of int from 0 to `usecols` inclusive instead." 
) if isinstance(usecols, str): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6adf69a922000..296b305f41dd2 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -737,12 +737,8 @@ def _to_str_columns(self) -> List[List[str]]: self.header = cast(List[str], self.header) if len(self.header) != len(self.columns): raise ValueError( - ( - "Writing {ncols} cols but got {nalias} " - "aliases".format( - ncols=len(self.columns), nalias=len(self.header) - ) - ) + f"Writing {len(self.columns)} cols " + f"but got {len(self.header)} aliases" ) str_columns = [[label] for label in self.header] else: diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index b46b2f6c671d6..e3161415fe2bc 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -216,8 +216,8 @@ def _write_table(self, indent: int = 0) -> None: self.classes = self.classes.split() if not isinstance(self.classes, (list, tuple)): raise TypeError( - "classes must be a string, list, or tuple, " - "not {typ}".format(typ=type(self.classes)) + "classes must be a string, list, " + f"or tuple, not {type(self.classes)}" ) _classes.extend(self.classes) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 008a99427f3c7..8ab56437d5c05 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -114,8 +114,7 @@ def pad_empties(x): column_format = index_format + column_format elif not isinstance(self.column_format, str): # pragma: no cover raise AssertionError( - "column_format must be str or unicode, " - "not {typ}".format(typ=type(column_format)) + f"column_format must be str or unicode, not {type(column_format)}" ) else: column_format = self.column_format diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index bb5bce96bc64b..211935009d2e5 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -267,8 +267,9 @@ cdef class Parser: elif column_types[j] == b's': self.column_types[j] = column_type_string else: - raise ValueError("unknown column type: " - f"{self.parser.columns[j].ctype}") + raise ValueError( + f"unknown column type: {self.parser.columns[j].ctype}" + ) # compression if parser.compression == const.rle_compression: diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index f917477b81489..9b40778dbcfdf 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -459,8 +459,7 @@ def _process_columnsize_subheader(self, offset, length): if self.col_count_p1 + self.col_count_p2 != self.column_count: print( f"Warning: column count mismatch ({self.col_count_p1} + " - f"{self.col_count_p2} != " - f"{self.column_count})\n" + f"{self.col_count_p2} != {self.column_count})\n" ) # Unknown purpose @@ -672,8 +671,7 @@ def _read_next_page(self): self.close() msg = ( "failed to read complete page from file (read " - f"{len(self._cached_page):d} of " - f"{self._page_length:d} bytes)" + f"{len(self._cached_page):d} of {self._page_length:d} bytes)" ) raise ValueError(msg) diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 56ebb583bc2f9..27d56d4ede403 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -49,8 +49,7 @@ def read_sas( if format is None: buffer_error_msg = ( "If this is a buffer object rather " - "than a string name, you must specify " - "a format string" + "than a string name, you must specify a format string" ) filepath_or_buffer = stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): diff --git 
a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 53a27e8782ad7..9e0ec4df02edf 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -132,8 +132,7 @@ def main(conda_fname, pip_fname, compare=False): ) if args.azure: msg = ( - "##vso[task.logissue type=error;" - f"sourcepath=requirements-dev.txt]{msg}" + f"##vso[task.logissue type=error;sourcepath=requirements-dev.txt]{msg}" ) sys.stderr.write(msg) sys.exit(res) From 20755396868c43f4ed0df78a63e5bd8825129a22 Mon Sep 17 00:00:00 2001 From: Christian Chwala Date: Mon, 13 Jan 2020 23:30:24 +0100 Subject: [PATCH 031/158] Added small corrections to the test for interpolate limit_area (#30987) --- pandas/tests/series/test_missing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 7b6d9210ed3d9..d8eeefcbdce7b 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1347,6 +1347,7 @@ def test_interp_limit_area(self): [np.nan, np.nan, 3.0, 4.0, np.nan, np.nan, 7.0, np.nan, np.nan] ) result = s.interpolate(method="linear", limit_area="inside", limit=1) + tm.assert_series_equal(result, expected) expected = Series([np.nan, np.nan, 3.0, 4.0, np.nan, 6.0, 7.0, np.nan, np.nan]) result = s.interpolate( @@ -1362,6 +1363,7 @@ def test_interp_limit_area(self): [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan] ) result = s.interpolate(method="linear", limit_area="outside", limit=1) + tm.assert_series_equal(result, expected) expected = Series([np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan]) result = s.interpolate( @@ -1371,8 +1373,9 @@ def test_interp_limit_area(self): expected = Series([3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan]) result = s.interpolate( - method="linear", limit_area="outside", direction="backward" + method="linear", limit_area="outside", limit_direction="backward" ) + tm.assert_series_equal(result, expected) # raises an error even if limit type is wrong. msg = r"Invalid limit_area: expecting one of \['inside', 'outside'\], got abc" From 8ff2ebd9b2cfbe4a3e1eb7893dc4343c01ad55b4 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Tue, 14 Jan 2020 02:47:47 +0200 Subject: [PATCH 032/158] STY: concat strings (#30991) --- pandas/compat/__init__.py | 3 +-- pandas/compat/numpy/__init__.py | 6 ++---- pandas/compat/numpy/function.py | 5 ++--- pandas/core/generic.py | 3 +-- pandas/core/reshape/concat.py | 11 +++------- pandas/core/reshape/merge.py | 5 ++--- pandas/io/clipboards.py | 3 +-- pandas/io/common.py | 3 +-- pandas/io/date_converters.py | 3 +-- pandas/io/feather_format.py | 9 +++----- pandas/io/html.py | 3 +-- pandas/io/json/_normalize.py | 3 +-- pandas/io/parquet.py | 6 ++---- pandas/io/parsers.py | 26 ++++++++---------------- pandas/io/pytables.py | 19 +++++++---------- pandas/io/sql.py | 9 +++----- pandas/io/stata.py | 8 ++++---- pandas/plotting/_matplotlib/converter.py | 3 +-- pandas/plotting/_matplotlib/hist.py | 3 +-- pandas/plotting/_matplotlib/tools.py | 3 +-- 20 files changed, 47 insertions(+), 87 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 60cfecd5804ac..3547a33ea357b 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -110,8 +110,7 @@ def _import_lzma(): return lzma except ImportError: msg = ( - "Could not import the lzma module. 
" - "Your installed Python is incomplete. " + "Could not import the lzma module. Your installed Python is incomplete. " "Attempting to use lzma compression will result in a RuntimeError." ) warnings.warn(msg) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 27f1c32058941..6c9ac5944e6a1 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -18,11 +18,9 @@ if _nlv < "1.13.3": raise ImportError( - f"this version of pandas is incompatible with " - f"numpy < 1.13.3\n" + "this version of pandas is incompatible with numpy < 1.13.3\n" f"your numpy version is {_np_version}.\n" - f"Please upgrade numpy to >= 1.13.3 to use " - f"this pandas version" + "Please upgrade numpy to >= 1.13.3 to use this pandas version" ) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 50f234cbf9419..05ecccc67daef 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -389,9 +389,8 @@ def validate_resampler_func(method: str, args, kwargs) -> None: if len(args) + len(kwargs) > 0: if method in RESAMPLER_NUMPY_OPS: raise UnsupportedFunctionCall( - f"numpy operations are not " - f"valid with resample. Use " - f".resample(...).{method}() instead" + "numpy operations are not valid with resample. " + f"Use .resample(...).{method}() instead" ) else: raise TypeError("too many arguments passed in") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 05066ac0ec128..ada26b55a778a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1701,8 +1701,7 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: multi_message = ( "\n" "For a multi-index, the label must be a " - "tuple with elements corresponding to " - "each level." + "tuple with elements corresponding to each level." ) else: multi_message = "" diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 502b8d1941fdf..449f70b2be2fd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -305,8 +305,7 @@ def __init__( if isinstance(objs, (NDFrame, str)): raise TypeError( "first argument must be an iterable of pandas " - "objects, you passed an object of type " - '"{name}"'.format(name=type(objs).__name__) + f'objects, you passed an object of type "{type(objs).__name__}"' ) if join == "outer": @@ -577,10 +576,7 @@ def _maybe_check_integrity(self, concat_index: Index): if self.verify_integrity: if not concat_index.is_unique: overlap = concat_index[concat_index.duplicated()].unique() - raise ValueError( - "Indexes have overlapping values: " - "{overlap!s}".format(overlap=overlap) - ) + raise ValueError(f"Indexes have overlapping values: {overlap}") def _concat_indexes(indexes) -> Index: @@ -648,8 +644,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError( - "Cannot concat indices that do " - "not have the same number of levels" + "Cannot concat indices that do not have the same number of levels" ) # also copies diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index acb53ff6ca555..ceee2f66dba42 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1071,9 +1071,8 @@ def _maybe_coerce_merge_keys(self): continue msg = ( - "You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. 
If you wish to proceed " - "you should use pd.concat".format(lk_dtype=lk.dtype, rk_dtype=rk.dtype) + f"You are trying to merge on {lk.dtype} and " + f"{rk.dtype} columns. If you wish to proceed you should use pd.concat" ) # if we are numeric, then allow differing diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 34e8e03d8771e..97178261bdf72 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -69,8 +69,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover kwargs["engine"] = "python" elif len(sep) > 1 and kwargs.get("engine") == "c": warnings.warn( - "read_clipboard with regex separator does not work " - "properly with c engine" + "read_clipboard with regex separator does not work properly with c engine" ) return read_csv(StringIO(text), sep=sep, **kwargs) diff --git a/pandas/io/common.py b/pandas/io/common.py index 771a302d647ec..6a764ff252dea 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -78,8 +78,7 @@ def _expand_user( def validate_header_arg(header) -> None: if isinstance(header, bool): raise TypeError( - "Passing a bool to header is invalid. " - "Use header=None for no header or " + "Passing a bool to header is invalid. Use header=None for no header or " "header=int or list-like of ints to specify " "the row(s) making up the column names" ) diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 7fdca2d65b05d..07919dbda63ae 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -57,8 +57,7 @@ def _check_columns(cols): for i, n in enumerate(map(len, tail)): if n != N: raise AssertionError( - f"All columns must have the same length: {N}; " - f"column {i} has length {n}" + f"All columns must have the same length: {N}; column {i} has length {n}" ) return N diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index eb05004d9137c..5d4925620e75f 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -37,16 +37,13 @@ def to_feather(df: DataFrame, path): typ = type(df.index) raise ValueError( f"feather does not support serializing {typ} " - "for the index; you can .reset_index() " - "to make the index into column(s)" + "for the index; you can .reset_index() to make the index into column(s)" ) if not df.index.equals(RangeIndex.from_range(range(len(df)))): raise ValueError( - "feather does not support serializing a " - "non-default index for the index; you " - "can .reset_index() to make the index " - "into column(s)" + "feather does not support serializing a non-default index for the index; " + "you can .reset_index() to make the index into column(s)" ) if df.index.name is not None: diff --git a/pandas/io/html.py b/pandas/io/html.py index eafcca0e85bb3..809ce77eef0bb 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -899,8 +899,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): f"The flavor {flav} failed to parse your input. " "Since you passed a non-rewindable file " "object, we can't rewind it to try " - "another parser. Try read_html() with a " - "different flavor." + "another parser. Try read_html() with a different flavor." 
) retained = caught diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index c0596c984575a..cf292a13fed7f 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -317,8 +317,7 @@ def _recursive_extract(data, path, seen_meta, level=0): meta_val = np.nan else: raise KeyError( - "Try running with " - "errors='ignore' as key " + "Try running with errors='ignore' as key " f"{e} is not always present" ) meta_vals[key].append(meta_val) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 3a686a1a3b122..4be62b886f076 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -32,8 +32,7 @@ def get_engine(engine: str) -> "BaseImpl": raise ImportError( "Unable to find a usable engine; " "tried using: 'pyarrow', 'fastparquet'.\n" - "pyarrow or fastparquet is required for parquet " - "support" + "pyarrow or fastparquet is required for parquet support" ) if engine == "pyarrow": @@ -156,8 +155,7 @@ def write( if "partition_on" in kwargs and partition_cols is not None: raise ValueError( "Cannot use both partition_on and " - "partition_cols. Use partition_cols for " - "partitioning data" + "partition_cols. Use partition_cols for partitioning data" ) elif "partition_on" in kwargs: partition_cols = kwargs.pop("partition_on") diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b4eb2fb1411d0..62b82f174e17c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -612,8 +612,7 @@ def parser_f( if delim_whitespace and delimiter != default_sep: raise ValueError( "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only " - "specify one." + "delim_whitespace=True; you can only specify one." ) if engine is not None: @@ -968,8 +967,7 @@ def _clean_options(self, options, engine): fallback_reason = ( "the 'c' engine does not support " "regex separators (separators > 1 char and " - r"different from '\s+' are " - "interpreted as regex)" + r"different from '\s+' are interpreted as regex)" ) engine = "python" elif delim_whitespace: @@ -1000,8 +998,7 @@ def _clean_options(self, options, engine): fallback_reason = ( "ord(quotechar) > 127, meaning the " "quotechar is larger than one byte, " - "and the 'c' engine does not support " - "such quotechars" + "and the 'c' engine does not support such quotechars" ) engine = "python" @@ -1119,9 +1116,8 @@ def _make_engine(self, engine="c"): klass = FixedWidthFieldParser else: raise ValueError( - f"Unknown engine: {engine} (valid options are " - '"c", "python", or ' - '"python-fwf")' + f"Unknown engine: {engine} (valid options " + 'are "c", "python", or "python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -1230,8 +1226,7 @@ def _validate_usecols_names(usecols, names): missing = [c for c in usecols if c not in names] if len(missing) > 0: raise ValueError( - "Usecols do not match columns, " - f"columns expected but not found: {missing}" + f"Usecols do not match columns, columns expected but not found: {missing}" ) return usecols @@ -1325,8 +1320,7 @@ def _validate_parse_dates_arg(parse_dates): that is the case. 
""" msg = ( - "Only booleans, lists, and " - "dictionaries are accepted " + "Only booleans, lists, and dictionaries are accepted " "for the 'parse_dates' parameter" ) @@ -1680,8 +1674,7 @@ def _convert_to_ndarrays( warnings.warn( ( "Both a converter and dtype were specified " - f"for column {c} - only the converter will " - "be used" + f"for column {c} - only the converter will be used" ), ParserWarning, stacklevel=7, @@ -1826,8 +1819,7 @@ def _cast_types(self, values, cast_type, column): except NotImplementedError: raise NotImplementedError( f"Extension Array: {array_type} must implement " - "_from_sequence_of_strings in order " - "to be used in parser methods" + "_from_sequence_of_strings in order to be used in parser methods" ) else: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d61d1cf7f0257..9e8d8a2e89f20 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -413,8 +413,8 @@ def read_hdf( for group_to_check in groups[1:]: if not _is_metadata_of(group_to_check, candidate_only_group): raise ValueError( - "key must be provided when HDF5 file " - "contains multiple datasets." + "key must be provided when HDF5 " + "file contains multiple datasets." ) key = candidate_only_group._v_pathname return store.select( @@ -1240,8 +1240,7 @@ def append_to_multiple( if v is None: if remain_key is not None: raise ValueError( - "append_to_multiple can only have one value in d that " - "is None" + "append_to_multiple can only have one value in d that is None" ) remain_key = k else: @@ -2313,8 +2312,7 @@ def validate_attr(self, append): existing_dtype = getattr(self.attrs, self.dtype_attr, None) if existing_dtype is not None and existing_dtype != self.dtype: raise ValueError( - "appended items dtype do not match existing " - "items dtype in table!" + "appended items dtype do not match existing items dtype in table!" ) def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): @@ -2680,14 +2678,12 @@ def validate_read(self, columns, where): if columns is not None: raise TypeError( "cannot pass a column specification when reading " - "a Fixed format store. this store must be " - "selected in its entirety" + "a Fixed format store. this store must be selected in its entirety" ) if where is not None: raise TypeError( "cannot pass a where specification when reading " - "from a Fixed format store. this store must be " - "selected in its entirety" + "from a Fixed format store. this store must be selected in its entirety" ) @property @@ -2908,8 +2904,7 @@ def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None) if is_categorical_dtype(value): raise NotImplementedError( - "Cannot store a category dtype in " - "a HDF5 dataset that uses format=" + "Cannot store a category dtype in a HDF5 dataset that uses format=" '"fixed". Use format="table".' ) if not empty_array: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index f4527994db0d2..58fed0d18dd4a 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -977,8 +977,7 @@ def _sqlalchemy_type(self, col): if col_type == "timedelta64": warnings.warn( "the 'timedelta' type is not supported, and will be " - "written as integer values (ns frequency) to the " - "database.", + "written as integer values (ns frequency) to the database.", UserWarning, stacklevel=8, ) @@ -1413,8 +1412,7 @@ def _get_valid_sqlite_name(name): _SAFE_NAMES_WARNING = ( "The spaces in these column names will not be changed. " - "In pandas versions < 0.14, spaces were converted to " - "underscores." 
+ "In pandas versions < 0.14, spaces were converted to underscores." ) @@ -1528,8 +1526,7 @@ def _sql_type_name(self, col): if col_type == "timedelta64": warnings.warn( "the 'timedelta' type is not supported, and will be " - "written as integer values (ns frequency) to the " - "database.", + "written as integer values (ns frequency) to the database.", UserWarning, stacklevel=8, ) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b216ee80c3940..2c1222aad12cc 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -639,8 +639,7 @@ def __init__(self, catarray, encoding="latin-1"): if self.text_len > 32000: raise ValueError( "Stata value labels for a single variable must " - "have a combined length less than 32,000 " - "characters." + "have a combined length less than 32,000 characters." ) # Ensure int32 @@ -1729,9 +1728,10 @@ def _do_select_columns(self, data, columns): raise ValueError("columns contains duplicate entries") unmatched = column_set.difference(data.columns) if unmatched: + joined = ", ".join(list(unmatched)) raise ValueError( - "The following columns were not found in the " - "Stata data set: " + ", ".join(list(unmatched)) + "The following columns were not " + f"found in the Stata data set: {joined}" ) # Copy information for retained columns for later processing dtyplist = [] diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 5b37ebb42aecc..a1035fd0823bb 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -421,8 +421,7 @@ def __call__(self): if estimate > self.MAXTICKS * 2: raise RuntimeError( "MillisecondLocator estimated to generate " - f"{estimate:d} ticks from {dmin} to {dmax}: " - "exceeds Locator.MAXTICKS" + f"{estimate:d} ticks from {dmin} to {dmax}: exceeds Locator.MAXTICKS" f"* 2 ({self.MAXTICKS * 2:d}) " ) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index f8b2c7ab123d0..d54fc73b495ba 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -318,8 +318,7 @@ def hist_series( if "figure" in kwds: raise ValueError( "Cannot pass 'figure' when using the " - "'by' argument, since a new 'Figure' instance " - "will be created" + "'by' argument, since a new 'Figure' instance will be created" ) axes = _grouped_hist( self, diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index dd4034a97f58e..d7732c86911b8 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -190,8 +190,7 @@ def _subplots( if sharex or sharey: warnings.warn( "When passing multiple axes, sharex and sharey " - "are ignored. These settings must be specified " - "when creating axes", + "are ignored. 
These settings must be specified when creating axes", UserWarning, stacklevel=4, ) From 13b22fd94d45afd44045ef77b8c929744efe6a7b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Jan 2020 20:46:11 -0800 Subject: [PATCH 033/158] CLN: de-duplicate _getitem_scalar (#30992) --- pandas/core/indexing.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index cc11879142ffe..10e71e72dd885 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1743,12 +1743,14 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): class _LocationIndexer(_NDFrameIndexer): + _takeable: bool = False + def __getitem__(self, key): if type(key) is tuple: key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): try: - return self._getitem_scalar(key) + return self.obj._get_value(*key, takeable=self._takeable) except (KeyError, IndexError, AttributeError): # AttributeError for IntervalTree get_value pass @@ -1763,9 +1765,6 @@ def __getitem__(self, key): def _is_scalar_access(self, key: Tuple): raise NotImplementedError() - def _getitem_scalar(self, key): - raise NotImplementedError() - def _getitem_axis(self, key, axis: int): raise NotImplementedError() @@ -1854,12 +1853,6 @@ def _is_scalar_access(self, key: Tuple) -> bool: return True - def _getitem_scalar(self, key): - # a fast-path to scalar access - # if not, raise - values = self.obj._get_value(*key) - return values - def _get_partial_string_timestamp_match_key(self, key, labels): """ Translate any partial string timestamp matches in key, returning the @@ -1965,6 +1958,7 @@ class _iLocIndexer(_LocationIndexer): "point is EXCLUDED), listlike of integers, boolean array" ) _get_slice_axis = _NDFrameIndexer._get_slice_axis + _takeable = True def _validate_key(self, key, axis: int): if com.is_bool_indexer(key): @@ -2029,12 +2023,6 @@ def _is_scalar_access(self, key: Tuple) -> bool: return True - def _getitem_scalar(self, key): - # a fast-path to scalar access - # if not, raise - values = self.obj._get_value(*key, takeable=True) - return values - def _validate_integer(self, key: int, axis: int) -> None: """ Check that 'key' is a valid position in the desired axis. From 0f048cb275aeeed8a202b979a32bb08b07953919 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jan 2020 00:57:39 -0800 Subject: [PATCH 034/158] CLN: remove geopandas compat code (#30909) --- pandas/core/indexing.py | 71 +-------------------------------- pandas/tests/test_downstream.py | 22 +--------- 2 files changed, 2 insertions(+), 91 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 10e71e72dd885..04503e5d98c10 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -27,7 +27,7 @@ is_list_like_indexer, length_of_indexer, ) -from pandas.core.indexes.api import Index, InvalidIndexError +from pandas.core.indexes.api import Index # "null slice" _NS = slice(None, None) @@ -579,39 +579,6 @@ def __call__(self, axis=None): new_self.axis = axis return new_self - # TODO: remove once geopandas no longer needs this - def __getitem__(self, key): - # Used in ix and downstream in geopandas _CoordinateIndexer - if type(key) is tuple: - # Note: we check the type exactly instead of with isinstance - # because NamedTuple is checked separately. 
- key = tuple(com.apply_if_callable(x, self.obj) for x in key) - try: - values = self.obj._get_value(*key) - except (KeyError, TypeError, InvalidIndexError, AttributeError): - # TypeError occurs here if the key has non-hashable entries, - # generally slice or list. - # TODO(ix): most/all of the TypeError cases here are for ix, - # so this check can be removed once ix is removed. - # The InvalidIndexError is only catched for compatibility - # with geopandas, see - # https://github.com/pandas-dev/pandas/issues/27258 - # TODO: The AttributeError is for IntervalIndex which - # incorrectly implements get_value, see - # https://github.com/pandas-dev/pandas/issues/27865 - pass - else: - if is_scalar(values): - return values - - return self._getitem_tuple(key) - else: - # we by definition only have the 0th axis - axis = self.axis or 0 - - key = com.apply_if_callable(key, self.obj) - return self._getitem_axis(key, axis=axis) - def _get_label(self, label, axis: int): if self.ndim == 1: # for perf reasons we want to try _xs first @@ -1460,42 +1427,6 @@ def _getitem_nested_tuple(self, tup: Tuple): return obj - # TODO: remove once geopandas no longer needs __getitem__ - def _getitem_axis(self, key, axis: int): - if is_iterator(key): - key = list(key) - self._validate_key(key, axis) - - labels = self.obj._get_axis(axis) - if isinstance(key, slice): - return self._get_slice_axis(key, axis=axis) - elif is_list_like_indexer(key) and not ( - isinstance(key, tuple) and isinstance(labels, ABCMultiIndex) - ): - - if hasattr(key, "ndim") and key.ndim > 1: - raise ValueError("Cannot index with multidimensional key") - - return self._getitem_iterable(key, axis=axis) - else: - - # maybe coerce a float scalar to integer - key = labels._maybe_cast_indexer(key) - - if is_integer(key): - if axis == 0 and isinstance(labels, ABCMultiIndex): - try: - return self._get_label(key, axis=axis) - except (KeyError, TypeError): - if self.obj.index.levels[0].is_integer(): - raise - - # this is the fallback! (for a non-float, non-integer index) - if not labels.is_floating() and not labels.is_integer(): - return self._get_loc(key, axis=axis) - - return self._get_label(key, axis=axis) - def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): """ Transform a list-like of keys into a new index and an indexer. 
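A quick aside on what the two CLN patches above (de-duplicating _getitem_scalar and
removing the geopandas compat path) preserve: with the per-subclass overrides and the
geopandas fallback gone, scalar lookups through both .loc and .iloc funnel into the
single fast path visible in the diff, self.obj._get_value(*key, takeable=self._takeable).
The sketch below is illustrative only and assumes a pandas build from this era (roughly
1.0); _get_value is a private helper named in the diff itself, not stable public API:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])

    # Label-based scalar access (.loc, where _takeable is False)
    assert df.loc["y", "a"] == 2

    # Position-based scalar access (.iloc, where _takeable is True)
    assert df.iloc[1, 0] == 2

    # Both reduce to the consolidated internal fast path shown in the diff:
    #     self.obj._get_value(*key, takeable=self._takeable)
    assert df._get_value("y", "a", takeable=False) == 2
    assert df._get_value(1, 0, takeable=True) == 2
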
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index ee006233c4c1b..8edd9f20ec63c 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -8,7 +8,7 @@
 import numpy as np  # noqa
 import pytest
 
-from pandas import DataFrame, Series
+from pandas import DataFrame
 import pandas._testing as tm
 
@@ -114,26 +114,6 @@ def test_geopandas():
     assert geopandas.read_file(fp) is not None
 
 
-def test_geopandas_coordinate_indexer():
-    # this test is included to have coverage of one case in the indexing.py
-    # code that is only kept for compatibility with geopandas, see
-    # https://github.com/pandas-dev/pandas/issues/27258
-    # We should be able to remove this after some time when its usage is
-    # removed in geopandas
-    from pandas.core.indexing import _NDFrameIndexer
-
-    class _CoordinateIndexer(_NDFrameIndexer):
-        def _getitem_tuple(self, tup):
-            obj = self.obj
-            xs, ys = tup
-            return obj[xs][ys]
-
-    Series._create_indexer("cx", _CoordinateIndexer)
-    s = Series(range(5))
-    res = s.cx[:, :]
-    tm.assert_series_equal(s, res)
-
-
 # Cython import warning
 @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning")
 @pytest.mark.filterwarnings("ignore:RangeIndex.* is deprecated:DeprecationWarning")

From b18024d051603702d1fd58b5bdb5b1ac5c7cf119 Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Tue, 14 Jan 2020 13:07:24 +0200
Subject: [PATCH 035/158] STY: Whitespace placed at the beginning instead of at the end of a line (#30996)

---
 pandas/core/arrays/categorical.py             |  4 +-
 pandas/core/arrays/datetimes.py               | 13 +++----
 pandas/core/arrays/period.py                  |  4 +-
 pandas/core/computation/expr.py               |  4 +-
 pandas/core/generic.py                        | 11 +++---
 pandas/core/indexes/base.py                   |  4 +-
 pandas/core/indexes/multi.py                  | 12 +++---
 pandas/core/series.py                         |  4 +-
 pandas/io/common.py                           |  4 +-
 pandas/io/parsers.py                          |  4 +-
 .../arrays/categorical/test_operators.py      |  8 ++--
 pandas/tests/computation/test_eval.py         | 12 +++---
 pandas/tests/frame/indexing/test_indexing.py  | 10 ++---
 pandas/tests/frame/test_api.py                |  4 +-
 pandas/tests/frame/test_constructors.py       |  6 +--
 pandas/tests/frame/test_dtypes.py             |  8 ++--
 pandas/tests/indexes/common.py                |  4 +-
 .../indexes/datetimes/test_constructors.py    |  4 +-
 pandas/tests/indexes/multi/test_analytics.py  |  4 +-
 pandas/tests/indexes/period/test_indexing.py  |  8 ++--
 pandas/tests/indexes/test_numeric.py          |  4 +-
 pandas/tests/indexing/test_floats.py          | 38 +++++++++----------
 pandas/tests/indexing/test_iloc.py            |  6 +--
 pandas/tests/indexing/test_indexing.py        |  8 ++--
 pandas/tests/indexing/test_loc.py             | 16 ++++----
 pandas/tests/indexing/test_partial.py         |  8 ++--
 pandas/tests/io/test_common.py                |  8 ++--
 pandas/tests/io/test_stata.py                 | 20 +++++-----
 pandas/tests/resample/test_resample_api.py    |  4 +-
 .../tests/scalar/timedelta/test_arithmetic.py |  4 +-
 .../tests/series/indexing/test_alter_index.py |  4 +-
 pandas/tests/series/indexing/test_boolean.py  |  8 ++--
 pandas/tests/series/indexing/test_indexing.py |  4 +-
 pandas/tests/series/indexing/test_numeric.py  |  4 +-
 pandas/tests/series/methods/test_argsort.py   |  4 +-
 pandas/tests/series/methods/test_isin.py      |  4 +-
 pandas/tests/series/methods/test_replace.py   |  4 +-
 pandas/tests/series/test_alter_axes.py        |  4 +-
 pandas/tests/series/test_dtypes.py            |  8 ++--
 pandas/tests/series/test_missing.py           |  4 +-
 pandas/tests/test_algos.py                    |  4 +-
 pandas/tests/util/test_validate_kwargs.py     |  4 +-
 42 files changed, 150 insertions(+), 154 deletions(-)

diff --git 
a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 2806635211459..9d7359dd9c614 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2404,8 +2404,8 @@ def isin(self, values): if not is_list_like(values): values_type = type(values).__name__ raise TypeError( - "only list-like objects are allowed to be passed" - f" to isin(), you passed a [{values_type}]" + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{values_type}]" ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e42402b307f28..1988b2e9e33f2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -234,11 +234,10 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): values = values._data if not isinstance(values, np.ndarray): - msg = ( + raise ValueError( f"Unexpected type '{type(values).__name__}'. 'values' must be " "a DatetimeArray ndarray, or Series or Index containing one of those." ) - raise ValueError(msg) if values.ndim not in [1, 2]: raise ValueError("Only 1-dimensional input arrays are supported.") @@ -249,20 +248,18 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): values = values.view(_NS_DTYPE) if values.dtype != _NS_DTYPE: - msg = ( - "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'." - f" Got {values.dtype} instead." + raise ValueError( + "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'. " + f"Got {values.dtype} instead." ) - raise ValueError(msg) dtype = _validate_dt64_dtype(dtype) if freq == "infer": - msg = ( + raise ValueError( "Frequency inference not allowed in DatetimeArray.__init__. " "Use 'pd.array()' instead." 
) - raise ValueError(msg) if copy: values = values.copy() diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 697d759206ff9..1e2a02e988fdd 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -297,8 +297,8 @@ def __arrow_array__(self, type=None): # ensure we have the same freq if self.freqstr != type.freq: raise TypeError( - "Not supported to convert PeriodArray to array with different" - f" 'freq' ({self.freqstr} vs {type.freq})" + "Not supported to convert PeriodArray to array with different " + f"'freq' ({self.freqstr} vs {type.freq})" ) else: raise TypeError( diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 1350587b5ca90..d91586e6c9b81 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -466,8 +466,8 @@ def _maybe_evaluate_binop( if res.has_invalid_return_type: raise TypeError( - f"unsupported operand type(s) for {res.op}:" - f" '{lhs.type}' and '{rhs.type}'" + f"unsupported operand type(s) for {res.op}: " + f"'{lhs.type}' and '{rhs.type}'" ) if self.engine != "pytables": diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ada26b55a778a..c501ada6b5783 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -265,8 +265,8 @@ def _validate_dtype(self, dtype): # a compound dtype if dtype.kind == "V": raise NotImplementedError( - "compound dtypes are not implemented" - f" in the {type(self).__name__} constructor" + "compound dtypes are not implemented " + f"in the {type(self).__name__} constructor" ) return dtype @@ -8993,11 +8993,10 @@ def tshift( new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods) elif orig_freq is not None: - msg = ( - f"Given freq {freq.rule_code} does not match" - f" PeriodIndex freq {orig_freq.rule_code}" + raise ValueError( + f"Given freq {freq.rule_code} does not match " + f"PeriodIndex freq {orig_freq.rule_code}" ) - raise ValueError(msg) else: new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods, freq) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 62e3fd28f6684..f2f53f564da76 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4790,8 +4790,8 @@ def get_slice_bound(self, label, side, kind): if side not in ("left", "right"): raise ValueError( - f"Invalid value for side kwarg, must be either" - f" 'left' or 'right': {side}" + "Invalid value for side kwarg, must be either " + f"'left' or 'right': {side}" ) original_label = label diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 84d7399cc4f2d..21421a6f6ea62 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1288,8 +1288,8 @@ def _get_level_number(self, level) -> int: if level < 0: orig_level = level - self.nlevels raise IndexError( - f"Too many levels: Index has only {self.nlevels} levels," - f" {orig_level} is not a valid level number" + f"Too many levels: Index has only {self.nlevels} levels, " + f"{orig_level} is not a valid level number" ) # Note: levels are zero-based elif level >= self.nlevels: @@ -2171,8 +2171,8 @@ def reorder_levels(self, order): order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: raise AssertionError( - f"Length of order must be same as number of levels ({self.nlevels})," - f" got {len(order)}" + f"Length of order must be same as number of levels ({self.nlevels}), " + f"got {len(order)}" ) new_levels = [self.levels[i] for i in order] new_codes = 
[self.codes[i] for i in order] @@ -2527,8 +2527,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): def _partial_tup_index(self, tup, side="left"): if len(tup) > self.lexsort_depth: raise UnsortedIndexError( - f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth" - f" ({self.lexsort_depth})" + f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth " + f"({self.lexsort_depth})" ) n = len(tup) diff --git a/pandas/core/series.py b/pandas/core/series.py index fe5c5fd5e2bc8..33565bbedade6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1404,8 +1404,8 @@ def to_string( # catch contract violations if not isinstance(result, str): raise AssertionError( - "result must be of type str, type" - f" of result is {repr(type(result).__name__)}" + "result must be of type str, type " + f"of result is {repr(type(result).__name__)}" ) if buf is None: diff --git a/pandas/io/common.py b/pandas/io/common.py index 6a764ff252dea..cf19169214c35 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -406,8 +406,8 @@ def get_handle( raise ValueError(f"Zero files found in ZIP file {path_or_buf}") else: raise ValueError( - "Multiple files found in ZIP file." - f" Only one file per ZIP: {zip_names}" + "Multiple files found in ZIP file. " + f"Only one file per ZIP: {zip_names}" ) # XZ Compression diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 62b82f174e17c..41db6ed0ef503 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -906,8 +906,8 @@ def _get_options_with_defaults(self, engine): pass else: raise ValueError( - f"The {repr(argname)} option is not supported with the" - f" {repr(engine)} engine" + f"The {repr(argname)} option is not supported with the " + f"{repr(engine)} engine" ) else: value = _deprecated_defaults.get(argname, default) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 8643e7f6f89c1..0c830c65e0f8b 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -97,8 +97,8 @@ def test_comparisons(self): # comparison (in both directions) with Series will raise s = Series(["b", "b", "b"]) msg = ( - "Cannot compare a Categorical for op __gt__ with type" - r" " + "Cannot compare a Categorical for op __gt__ with type " + r"" ) with pytest.raises(TypeError, match=msg): cat > s @@ -265,8 +265,8 @@ def test_comparisons(self, data, reverse, base): # categorical cannot be compared to Series or numpy array, and also # not the other way around msg = ( - "Cannot compare a Categorical for op __gt__ with type" - r" " + "Cannot compare a Categorical for op __gt__ with type " + r"" ) with pytest.raises(TypeError, match=msg): cat > s diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 7f68abb92ba43..656b274aa1a9e 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -274,9 +274,9 @@ def check_operands(left, right, cmp_op): def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = f"lhs {cmp1} rhs" msg = ( - r"only list-like( or dict-like)? objects are allowed to be" - r" passed to (DataFrame\.)?isin\(\), you passed a" - r" (\[|')bool(\]|')|" + r"only list-like( or dict-like)? 
objects are allowed to be " + r"passed to (DataFrame\.)?isin\(\), you passed a " + r"(\[|')bool(\]|')|" "argument of type 'bool' is not iterable" ) if cmp1 in ("in", "not in") and not is_list_like(rhs): @@ -408,9 +408,9 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): ex = f"~(lhs {cmp1} rhs)" msg = ( - r"only list-like( or dict-like)? objects are allowed to be" - r" passed to (DataFrame\.)?isin\(\), you passed a" - r" (\[|')float(\]|')|" + r"only list-like( or dict-like)? objects are allowed to be " + r"passed to (DataFrame\.)?isin\(\), you passed a " + r"(\[|')float(\]|')|" "argument of type 'float' is not iterable" ) if is_scalar(rhs) and cmp1 in skip_these: diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 33c0e92845484..40ecda7d74952 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -447,8 +447,8 @@ def test_setitem(self, float_frame): tm.assert_series_equal(series, float_frame["col6"], check_names=False) msg = ( - r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the" - r" \[columns\]\"" + r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the " + r"\[columns\]\"" ) with pytest.raises(KeyError, match=msg): float_frame[np.random.randn(len(float_frame) + 1)] = 1 @@ -1039,9 +1039,9 @@ def test_getitem_setitem_float_labels(self): # positional slicing only via iloc! msg = ( - "cannot do slice indexing on" - r" with" - r" these indexers \[1.0\] of " + "cannot do slice indexing on " + r" with " + r"these indexers \[1.0\] of " ) with pytest.raises(TypeError, match=msg): df.iloc[1.0:5] diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 9263409f7a7f8..9de5d6fe16a0d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -382,8 +382,8 @@ def test_swapaxes(self): tm.assert_frame_equal(df.T, df.swapaxes(1, 0)) tm.assert_frame_equal(df, df.swapaxes(0, 0)) msg = ( - "No axis named 2 for object type" - r" " + "No axis named 2 for object type " + r"" ) with pytest.raises(ValueError, match=msg): df.swapaxes(2, 5) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ea1e339f44d93..a861e0eb52391 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1854,9 +1854,9 @@ def check(df): # No NaN found -> error if len(indexer) == 0: msg = ( - "cannot do label indexing on" - r" " - r" with these indexers \[nan\] of " + "cannot do label indexing on " + r" " + r"with these indexers \[nan\] of " ) with pytest.raises(TypeError, match=msg): df.loc[:, np.nan] diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 06bb040224455..0d34f61ef1e5a 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -897,15 +897,15 @@ def test_astype_to_incorrect_datetimelike(self, unit): df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) msg = ( - r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" - r" \[timedelta64\[{}\]\]" + r"cannot astype a datetimelike from \[datetime64\[ns\]\] to " + r"\[timedelta64\[{}\]\]" ).format(unit) with pytest.raises(TypeError, match=msg): df.astype(other) msg = ( - r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" - r" \[datetime64\[{}\]\]" + r"cannot astype a timedelta from \[timedelta64\[ns\]\] to " + r"\[datetime64\[{}\]\]" ).format(unit) df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, 
match=msg): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index a16017b0e12c0..afc068d6696ef 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -37,8 +37,8 @@ class Base: def test_pickle_compat_construction(self): # need an object to create with msg = ( - r"Index\(\.\.\.\) must be called with a collection of some" - r" kind, None was passed|" + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, None was passed|" r"__new__\(\) missing 1 required positional argument: 'data'|" r"__new__\(\) takes at least 2 arguments \(1 given\)" ) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index ffe51dd1fb9f5..95d14ad4c86f7 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -644,8 +644,8 @@ def test_constructor_dtype(self): ) msg = ( - "cannot supply both a tz and a timezone-naive dtype" - r" \(i\.e\. datetime64\[ns\]\)" + "cannot supply both a tz and a timezone-naive dtype " + r"\(i\.e\. datetime64\[ns\]\)" ) with pytest.raises(ValueError, match=msg): DatetimeIndex(idx, dtype="datetime64[ns]") diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index ac1e0893683d1..209cc627aba8b 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -334,8 +334,8 @@ def test_numpy_ufuncs(idx, func): else: expected_exception = TypeError msg = ( - "loop of ufunc does not support argument 0 of type tuple which" - f" has no callable {func.__name__} method" + "loop of ufunc does not support argument 0 of type tuple which " + f"has no callable {func.__name__} method" ) with pytest.raises(expected_exception, match=msg): func(idx) diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 7dbefbdaff98e..2e3bf852667e5 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -409,8 +409,8 @@ def test_get_loc(self): idx0.get_loc(1.1) msg = ( - r"'PeriodIndex\(\['2017-09-01', '2017-09-02', '2017-09-03'\]," - r" dtype='period\[D\]', freq='D'\)' is an invalid key" + r"'PeriodIndex\(\['2017-09-01', '2017-09-02', '2017-09-03'\], " + r"dtype='period\[D\]', freq='D'\)' is an invalid key" ) with pytest.raises(TypeError, match=msg): idx0.get_loc(idx0) @@ -434,8 +434,8 @@ def test_get_loc(self): idx1.get_loc(1.1) msg = ( - r"'PeriodIndex\(\['2017-09-02', '2017-09-02', '2017-09-03'\]," - r" dtype='period\[D\]', freq='D'\)' is an invalid key" + r"'PeriodIndex\(\['2017-09-02', '2017-09-02', '2017-09-03'\], " + r"dtype='period\[D\]', freq='D'\)' is an invalid key" ) with pytest.raises(TypeError, match=msg): idx1.get_loc(idx1) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index f025168643ab9..582f6c619d287 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -188,8 +188,8 @@ def test_constructor_invalid(self): # invalid msg = ( - r"Float64Index\(\.\.\.\) must be called with a collection of" - r" some kind, 0\.0 was passed" + r"Float64Index\(\.\.\.\) must be called with a collection of " + r"some kind, 0\.0 was passed" ) with pytest.raises(TypeError, match=msg): Float64Index(0.0) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 2cc8232566aa9..5530896a90941 100644 --- 
a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -123,9 +123,9 @@ def test_scalar_non_numeric(self): # setting with a float fails with iloc msg = ( - r"cannot do (label|index|positional) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}".format(klass=type(i), kind=str(float)) + r"cannot do (label|index|positional) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=type(i), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s.iloc[3.0] = 0 @@ -160,9 +160,9 @@ def test_scalar_non_numeric(self): s = Series(np.arange(len(i)), index=i) s[3] msg = ( - r"cannot do (label|index) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}".format(klass=type(i), kind=str(float)) + r"cannot do (label|index) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=type(i), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[3.0] @@ -177,9 +177,9 @@ def test_scalar_with_mixed(self): for idxr in [lambda x: x, lambda x: x.iloc]: msg = ( - r"cannot do label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}|" + r"cannot do label indexing " + r"on {klass} with these indexers \[1\.0\] of " + r"{kind}|" "Cannot index by location index with a non-integer key".format( klass=str(Index), kind=str(float) ) @@ -199,9 +199,9 @@ def test_scalar_with_mixed(self): for idxr in [lambda x: x]: msg = ( - r"cannot do label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}".format(klass=str(Index), kind=str(float)) + r"cannot do label indexing " + r"on {klass} with these indexers \[1\.0\] of " + r"{kind}".format(klass=str(Index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): idxr(s3)[1.0] @@ -313,9 +313,9 @@ def test_scalar_float(self): s.iloc[3.0] msg = ( - r"cannot do positional indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}".format(klass=str(Float64Index), kind=str(float)) + r"cannot do positional indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=str(Float64Index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s2.iloc[3.0] = 0 @@ -379,10 +379,10 @@ def test_slice_non_numeric(self): for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers" - r" \[(3|4)(\.0)?\]" - r" of ({kind_float}|{kind_int})".format( + "cannot do slice indexing " + r"on {klass} with these indexers " + r"\[(3|4)(\.0)?\] " + r"of ({kind_float}|{kind_int})".format( klass=type(index), kind_float=str(float), kind_int=str(int), diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 26dedf02e7333..48c25ec034653 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -437,9 +437,9 @@ def test_iloc_getitem_labelled_frame(self): # trying to use a label msg = ( - r"Location based indexing can only have \[integer, integer" - r" slice \(START point is INCLUDED, END point is EXCLUDED\)," - r" listlike of integers, boolean array\] types" + r"Location based indexing can only have \[integer, integer " + r"slice \(START point is INCLUDED, END point is EXCLUDED\), " + r"listlike of integers, boolean array\] types" ) with pytest.raises(ValueError, match=msg): df.iloc["j", "D"] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 448a06070c45c..1913caae93932 100644 --- 
a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -81,8 +81,8 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): nd3 = np.random.randint(5, size=(2, 2, 2)) msg = ( - r"Buffer has wrong number of dimensions \(expected 1," - r" got 3\)|" + r"Buffer has wrong number of dimensions \(expected 1, " + r"got 3\)|" "Cannot index with multidimensional key|" r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]|" "Index data must be 1-dimensional" @@ -134,8 +134,8 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): nd3 = np.random.randint(5, size=(2, 2, 2)) msg = ( - r"Buffer has wrong number of dimensions \(expected 1," - r" got 3\)|" + r"Buffer has wrong number of dimensions \(expected 1, " + r"got 3\)|" "'pandas._libs.interval.IntervalTree' object has no attribute " "'set_value'|" # AttributeError "unhashable type: 'numpy.ndarray'|" # TypeError diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index a36078b11c663..78fcd15ab4cc1 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -219,8 +219,8 @@ def test_loc_to_fail(self): # raise a KeyError? msg = ( - r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): df.loc[[1, 2], [1, 2]] @@ -236,8 +236,8 @@ def test_loc_to_fail(self): s.loc[-1] msg = ( - r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): s.loc[[-1, -2]] @@ -252,8 +252,8 @@ def test_loc_to_fail(self): s["a"] = 2 msg = ( - r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): s.loc[[-2]] @@ -268,8 +268,8 @@ def test_loc_to_fail(self): df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) msg = ( - r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): df.loc[[3], :] diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 5fda759020f1a..2ce07ec41758f 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -205,8 +205,8 @@ def test_series_partial_set(self): # raises as nothing in in the index msg = ( - r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): ser.loc[[3, 3, 3]] @@ -286,8 +286,8 @@ def test_series_partial_set_with_name(self): # raises as nothing in in the index msg = ( - r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'," - r" name='idx'\)\] are in the \[index\]\"" + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64', " + r"name='idx'\)\] are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): ser.loc[[3, 3, 3]] diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a126f83164ce5..22aa78919ef0f 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -146,8 +146,8 @@ def test_read_non_existant(self, 
reader, module, error_class, fn_ext): msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" - fr" '.+does_not_exist\.{fn_ext}'" + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: " + fr"'.+does_not_exist\.{fn_ext}'" ) msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" msg7 = ( @@ -186,8 +186,8 @@ def test_read_expands_user_home_dir( msg3 = "Unexpected character found when decoding 'false'" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" - fr" '.+does_not_exist\.{fn_ext}'" + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: " + fr"'.+does_not_exist\.{fn_ext}'" ) msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" msg7 = ( diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 1d3cddbf01738..5e4ec116645b0 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -737,10 +737,10 @@ def test_excessively_long_string(self): ) original = DataFrame(s) msg = ( - r"Fixed width strings in Stata \.dta files are limited to 244" - r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy" - r" this restriction\. Use the\n'version=117' parameter to write" - r" the newer \(Stata 13 and later\) format\." + r"Fixed width strings in Stata \.dta files are limited to 244 " + r"\(or fewer\)\ncharacters\. Column 's500' does not satisfy " + r"this restriction\. Use the\n'version=117' parameter to write " + r"the newer \(Stata 13 and later\) format\." ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -968,8 +968,8 @@ def test_categorical_warnings_and_errors(self): ) with tm.ensure_clean() as path: msg = ( - "Stata value labels for a single variable must have" - r" a combined length less than 32,000 characters\." + "Stata value labels for a single variable must have " + r"a combined length less than 32,000 characters\." ) with pytest.raises(ValueError, match=msg): original.to_stata(path) @@ -1714,12 +1714,12 @@ def test_invalid_file_not_written(self, version): df = DataFrame([content], columns=["invalid"]) with tm.ensure_clean() as path: msg1 = ( - r"'latin-1' codec can't encode character '\\ufffd'" - r" in position 14: ordinal not in range\(256\)" + r"'latin-1' codec can't encode character '\\ufffd' " + r"in position 14: ordinal not in range\(256\)" ) msg2 = ( - "'ascii' codec can't decode byte 0xef in position 14:" - r" ordinal not in range\(128\)" + "'ascii' codec can't decode byte 0xef in position 14: " + r"ordinal not in range\(128\)" ) with pytest.raises(UnicodeEncodeError, match=r"{}|{}".format(msg1, msg2)): with tm.assert_produces_warning(ResourceWarning): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 170201b4f8e5c..d552241f9126f 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -257,8 +257,8 @@ def test_fillna(): tm.assert_series_equal(result, expected) msg = ( - r"Invalid fill method\. Expecting pad \(ffill\), backfill" - r" \(bfill\) or nearest\. Got 0" + r"Invalid fill method\. Expecting pad \(ffill\), backfill " + r"\(bfill\) or nearest\. 
Got 0" ) with pytest.raises(ValueError, match=msg): r.fillna(0) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 3764d9b7548fc..ce08a47f824ee 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -271,8 +271,8 @@ def test_ops_ndarray(self): tm.assert_numpy_array_equal(td * np.array([2]), expected) tm.assert_numpy_array_equal(np.array([2]) * td, expected) msg = ( - "ufunc '?multiply'? cannot use operands with types" - r" dtype\(' with these indexers \[{key}\]" - r" of " + r"\.datetimes\.DatetimeIndex'> with these indexers \[{key}\] " + r"of " ) with pytest.raises(TypeError, match=msg.format(key=r"4\.0")): datetime_series[4.0:10.0] diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 1fc98ded0d3d2..62273e2d363fb 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -52,8 +52,8 @@ def test_argsort_stable(self): tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False) tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) msg = ( - r"ndarray Expected type ," - r" found instead" + r"ndarray Expected type , " + r"found instead" ) with pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(qindexer, mindexer) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index ca93e989ba6b5..3836c1d56bf87 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -29,8 +29,8 @@ def test_isin_with_string_scalar(self): # GH#4763 s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) msg = ( - r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[str\]" + r"only list-like objects are allowed to be passed to isin\(\), " + r"you passed a \[str\]" ) with pytest.raises(TypeError, match=msg): s.isin("a") diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index b20baa2836363..770ad38b0215e 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -120,8 +120,8 @@ def test_replace_with_single_list(self): # make sure things don't get corrupted when fillna call fails s = ser.copy() msg = ( - r"Invalid fill method\. Expecting pad \(ffill\) or backfill" - r" \(bfill\)\. Got crash_cymbal" + r"Invalid fill method\. Expecting pad \(ffill\) or backfill " + r"\(bfill\)\. 
Got crash_cymbal" ) with pytest.raises(ValueError, match=msg): s.replace([1, 2, 3], inplace=True, method="crash_cymbal") diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 628c66583535d..71f6681e8c955 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -11,8 +11,8 @@ class TestSeriesAlterAxes: def test_setindex(self, string_series): # wrong type msg = ( - r"Index\(\.\.\.\) must be called with a collection of some" - r" kind, None was passed" + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, None was passed" ) with pytest.raises(TypeError, match=msg): string_series.index = None diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index a57ec2ba05d54..1fc582156a884 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -193,8 +193,8 @@ def test_astype_dict_like(self, dtype_class): dt3 = dtype_class({"abc": str, "def": str}) msg = ( - "Only the Series name can be used for the key in Series dtype" - r" mappings\." + "Only the Series name can be used for the key in Series dtype " + r"mappings\." ) with pytest.raises(KeyError, match=msg): s.astype(dt3) @@ -410,8 +410,8 @@ def test_arg_for_errors_in_astype(self): s = Series([1, 2, 3]) msg = ( - r"Expected value of kwarg 'errors' to be one of \['raise'," - r" 'ignore'\]\. Supplied value is 'False'" + r"Expected value of kwarg 'errors' to be one of \['raise', " + r"'ignore'\]\. Supplied value is 'False'" ) with pytest.raises(ValueError, match=msg): s.astype(np.float64, errors=False) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index d8eeefcbdce7b..6b7d9e00a5228 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1324,8 +1324,8 @@ def test_interp_limit_bad_direction(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) msg = ( - r"Invalid limit_direction: expecting one of \['forward'," - r" 'backward', 'both'\], got 'abc'" + r"Invalid limit_direction: expecting one of \['forward', " + r"'backward', 'both'\], got 'abc'" ) with pytest.raises(ValueError, match=msg): s.interpolate(method="linear", limit=2, limit_direction="abc") diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 2b46f86d49c5e..6c7f8c9b0475e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -653,8 +653,8 @@ class TestIsin: def test_invalid(self): msg = ( - r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[int\]" + r"only list-like objects are allowed to be passed to isin\(\), " + r"you passed a \[int\]" ) with pytest.raises(TypeError, match=msg): algos.isin(1, 1) diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index a7b6d8f98cc60..8fe2a3712bf49 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -49,8 +49,8 @@ def test_validation(): @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_kwarg_fail(name, value): msg = ( - f'For argument "{name}" expected type bool,' - f" received type {type(value).__name__}" + f'For argument "{name}" expected type bool, ' + f"received type {type(value).__name__}" ) with pytest.raises(ValueError, match=msg): From 3471270b1088e552879f5f292a31cc763c1549bb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jan 2020 06:39:47 -0600 Subject: [PATCH 036/158] API: 
Disallow NaN in StringArray constructor (#30980) --- pandas/_libs/lib.pyx | 4 +++ pandas/core/arrays/string_.py | 34 ++++++++++++++-------- pandas/core/strings.py | 8 +++-- pandas/tests/arrays/string_/test_string.py | 19 ++++++++++++ pandas/tests/dtypes/test_inference.py | 7 ++++- pandas/tests/test_strings.py | 2 +- 6 files changed, 58 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 719db5c03f07f..acd74591134bc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1624,6 +1624,10 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) + cdef bint is_valid_null(self, object value) except -1: + # We deliberately exclude None / NaN here since StringArray uses NA + return value is C_NA + cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 84130132de4dc..c485d1f50dc9d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -93,9 +93,6 @@ class StringArray(PandasArray): StringArray is considered experimental. The implementation and parts of the API may change without warning. - In particular, the NA value used may change to no longer be - ``numpy.nan``. - Parameters ---------- values : array-like @@ -104,8 +101,11 @@ class StringArray(PandasArray): .. warning:: Currently, this expects an object-dtype ndarray - where the elements are Python strings. This may - change without warning in the future. + where the elements are Python strings or :attr:`pandas.NA`. + This may change without warning in the future. Use + :meth:`pandas.array` with ``dtype="string"`` for a stable way of + creating a `StringArray` from any sequence. + copy : bool, default False Whether to copy the array of data. @@ -119,6 +119,8 @@ class StringArray(PandasArray): See Also -------- + array + The recommended function for creating a StringArray. Series.str The string methods are available on Series backed by a StringArray. @@ -165,12 +167,10 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError( - "StringArray requires a sequence of strings or missing values." - ) + raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( - "StringArray requires a sequence of strings. Got " + "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) @@ -178,12 +178,22 @@ def _validate(self): def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" - result = super()._from_sequence(scalars, dtype=object, copy=copy) + + result = np.asarray(scalars, dtype="object") + if copy and result is scalars: + result = result.copy() + # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. 
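The practical effect of this change, sketched from the docstring and tests added in this patch (the snippet below is an illustration, not part of the diff; output comments are abbreviated):

    import numpy as np
    import pandas as pd

    # The strict constructor now rejects anything that is not a string or pd.NA.
    try:
        pd.arrays.StringArray(np.array(["a", np.nan], dtype=object))
    except ValueError as err:
        print(err)  # StringArray requires a sequence of strings or pandas.NA

    # The coercing path standardizes every missing-like value to pd.NA.
    arr = pd.array(["a", np.nan, None], dtype="string")
    print(arr[1] is pd.NA)  # True
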
- result[result.isna()] = StringDtype.na_value - return result + na_values = isna(result) + if na_values.any(): + if result is scalars: + # force a copy now, if we haven't already + result = result.copy() + result[na_values] = StringDtype.na_value + + return cls(result) @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f8d9eeb211a1e..0323eafff8dee 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -8,6 +8,7 @@ import numpy as np import pandas._libs.lib as lib +import pandas._libs.missing as libmissing import pandas._libs.ops as libops from pandas._typing import ArrayLike, Dtype from pandas.util._decorators import Appender @@ -118,12 +119,15 @@ def cat_safe(list_of_columns: List, sep: str): return result -def _na_map(f, arr, na_result=np.nan, dtype=object): - # should really _check_ for NA +def _na_map(f, arr, na_result=None, dtype=object): if is_extension_array_dtype(arr.dtype): + if na_result is None: + na_result = libmissing.NA # just StringDtype arr = extract_array(arr) return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) + if na_result is None: + na_result = np.nan return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 33e68f029922e..5e2f14af341ab 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -194,6 +194,25 @@ def test_constructor_raises(): with pytest.raises(ValueError, match="sequence of strings"): pd.arrays.StringArray(np.array([])) + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", np.nan], dtype=object)) + + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", None], dtype=object)) + + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object)) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_from_sequence_no_mutate(copy): + a = np.array(["a", np.nan], dtype=object) + original = a.copy() + result = pd.arrays.StringArray._from_sequence(a, copy=copy) + expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object)) + tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(a, original) + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index d022b0e97877a..5eb85de2b90f5 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1114,11 +1114,16 @@ def test_is_string_array(self): assert lib.is_string_array(np.array(["foo", "bar"])) assert not lib.is_string_array( - np.array(["foo", "bar", np.nan], dtype=object), skipna=False + np.array(["foo", "bar", pd.NA], dtype=object), skipna=False ) assert lib.is_string_array( + np.array(["foo", "bar", pd.NA], dtype=object), skipna=True + ) + # NaN is not valid for string array, just NA + assert not lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) + assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index a92f917820bd0..c37c78f3b9235 100644 --- a/pandas/tests/test_strings.py +++ 
b/pandas/tests/test_strings.py @@ -3521,7 +3521,7 @@ def test_string_array(any_string_method): if isinstance(expected, Series): if expected.dtype == "object" and lib.is_string_array( - expected.values, skipna=True + expected.dropna().values, ): assert result.dtype == "string" result = result.astype(object) From 81d96369753f5244f011e41bf12b439a53f68852 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jan 2020 04:52:25 -0800 Subject: [PATCH 037/158] BUG: ensure_datetime64ns with bigendian array (#30976) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/_libs/tslibs/conversion.pyx | 5 +++++ pandas/tests/series/test_constructors.py | 9 +++++++++ pandas/tests/tslibs/test_conversion.py | 9 +++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c3ee72f6442fc..14f82c2e71519 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -82,7 +82,7 @@ Numeric Conversion ^^^^^^^^^^ - +- Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`) - - diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 2988d7bae9a5e..a2b433c2007ff 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -99,6 +99,11 @@ def ensure_datetime64ns(arr: ndarray, copy: bool=True): shape = (<object>arr).shape + if (<object>arr).dtype.byteorder == ">": + # GH#29684 we incorrectly get OutOfBoundsDatetime if we don't swap + dtype = arr.dtype + arr = arr.astype(dtype.newbyteorder("<")) + ivalues = arr.view(np.int64).ravel() result = np.empty(shape, dtype=NS_DTYPE) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index c38e5708be09b..d760939657d47 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -967,6 +967,15 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET") tm.assert_series_equal(result, expected) + def test_constructor_datetime64_bigendian(self): + # GH#30976 + ms = np.datetime64(1, "ms") + arr = np.array([np.datetime64(1, "ms")], dtype=">M8[ms]") + + result = Series(arr) + expected = Series([Timestamp(ms)]) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("interval_constructor", [IntervalIndex, IntervalArray]) def test_construction_interval(self, interval_constructor): # construction from interval & array of intervals diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 2beeae85de683..96c2d6bbd8106 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -72,6 +72,15 @@ def test_length_zero_copy(dtype, copy): assert result.base is (None if copy else arr) +def test_ensure_datetime64ns_bigendian(): + # GH#29684 + arr = np.array([np.datetime64(1, "ms")], dtype=">M8[ms]") + result = conversion.ensure_datetime64ns(arr) + + expected = np.array([np.datetime64(1, "ms")], dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + class SubDatetime(datetime): pass From d78c061669b7c1a0a50a393a8e58d660d84a79f9 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 14 Jan 2020 17:02:31 +0100 Subject: [PATCH 038/158] ENH: Add Stata 119 writer (#30959) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/frame.py | 40 +++++++----- pandas/io/stata.py | 108 ++++++++++++++++++++++---------- pandas/tests/io/test_stata.py | 
111 ++++++++++++++--------- 4 files changed, 163 insertions(+), 98 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0879189a822f8..c423933d4c438 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -223,7 +223,7 @@ Other enhancements - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`) - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`) - :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) -- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`) +- Added new writer for exporting Stata dta files in versions 118 and 119, ``StataWriterUTF8``. These file formats support exporting strings containing Unicode characters. Format 119 supports data sets with more than 32,767 variables (:issue:`23573`, :issue:`30959`) - :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`) - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 594b8a00a8672..42dc21156ba59 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1898,14 +1898,22 @@ def to_stata( variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - version : {114, 117}, default 114 - Version to use in the output dta file. Version 114 can be used - read by Stata 10 and later. Version 117 can be read by Stata 13 - or later. Version 114 limits string variables to 244 characters or - fewer while 117 allows strings with lengths up to 2,000,000 - characters. + version : {114, 117, 118, 119, None}, default 114 + Version to use in the output dta file. Set to None to let pandas + decide between 118 or 119 formats depending on the number of + columns in the frame. Version 114 can be read by Stata 10 and + later. Version 117 can be read by Stata 13 or later. Version 118 + is supported in Stata 14 and later. Version 119 is supported in + Stata 15 and later. Version 114 limits string variables to 244 + characters or fewer while versions 117 and later allow strings + with lengths up to 2,000,000 characters. Versions 118 and 119 + support Unicode characters, and version 119 supports more than + 32,767 variables. .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + + Added support for formats 118 and 119. convert_strl : list, optional List of column names to convert to string columns to Stata StrL @@ -1939,20 +1947,24 @@ ... 
'speed': [350, 18, 361, 15]}) >>> df.to_stata('animals.dta') # doctest: +SKIP """ - kwargs = {} - if version not in (114, 117, 118): - raise ValueError("Only formats 114, 117 and 118 are supported.") + if version not in (114, 117, 118, 119, None): + raise ValueError("Only formats 114, 117, 118 and 119 are supported.") if version == 114: if convert_strl is not None: raise ValueError("strl is not supported in format 114") from pandas.io.stata import StataWriter as statawriter - else: - if version == 117: - from pandas.io.stata import StataWriter117 as statawriter - else: - from pandas.io.stata import StataWriter118 as statawriter + elif version == 117: + from pandas.io.stata import StataWriter117 as statawriter + else: # versions 118 and 119 + from pandas.io.stata import StataWriterUTF8 as statawriter + kwargs = {} + if version is None or version >= 117: + # strl conversion is only supported >= 117 kwargs["convert_strl"] = convert_strl + if version is None or version >= 118: + # Specifying the version is only supported for UTF8 (118 or 119) + kwargs["version"] = version writer = statawriter( path, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2c1222aad12cc..b8e04ad55dde1 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -15,7 +15,7 @@ import os import struct import sys -from typing import Any +from typing import Any, Dict, Hashable, Optional, Sequence import warnings from dateutil.relativedelta import relativedelta @@ -23,6 +23,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array +from pandas._typing import FilePathOrBuffer from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -47,9 +48,10 @@ from pandas.io.common import get_filepath_or_buffer, stringify_path _version_error = ( - "Version of given Stata file is not 104, 105, 108, " - "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " - "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" + "Version of given Stata file is {version}. pandas supports importing " + "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " + "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16), " + "and 119 (Stata 15/16, over 32,767 variables)." ) _statafile_processing_params1 = """\ @@ -1090,11 +1092,11 @@ def _read_header(self): self.col_sizes = [self._calcsize(typ) for typ in self.typlist] def _read_new_header(self, first_char): - # The first part of the header is common to 117 and 118. + # The first part of the header is common to 117 - 119. self.path_or_buf.read(27) # stata_dta><header><release>
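For orientation, the 117+ header parsed here is a run of XML-like tags, which is why the reader can skip a fixed-width prefix and then read a fixed-width payload; read(27) resumes after the already-consumed leading "<". A hedged standalone sketch of the same parse (the file name is illustrative):

    # b"<stata_dta><header><release>" is 28 bytes; the next 3 bytes hold the
    # format version as ASCII digits.
    with open("example.dta", "rb") as f:
        assert f.read(28) == b"<stata_dta><header><release>"
        version = int(f.read(3))  # 117, 118 or 119
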
self.format_version = int(self.path_or_buf.read(3)) if self.format_version not in [117, 118, 119]: - raise ValueError(_version_error) + raise ValueError(_version_error.format(version=self.format_version)) self._set_encoding() self.path_or_buf.read(21) # </release><byteorder> self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<" @@ -1287,7 +1289,7 @@ def _get_seek_variable_labels(self): def _read_old_header(self, first_char): self.format_version = struct.unpack("b", first_char)[0] if self.format_version not in [104, 105, 108, 111, 113, 114, 115]: - raise ValueError(_version_error) + raise ValueError(_version_error.format(version=self.format_version)) self._set_encoding() self.byteorder = ( struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<" ) @@ -2695,7 +2697,7 @@ def _convert_key(self, key): def generate_table(self): """ - Generates the GSO lookup table for the DataFRame + Generates the GSO lookup table for the DataFrame Returns ------- @@ -2934,9 +2936,9 @@ def _write_header(self, data_label=None, time_stamp=None): bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) - # number of vars, 2 bytes - assert self.nvar < 2 ** 16 - bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K")) + # number of vars, 2 bytes in 117 and 118, 4 bytes in 119 + nvar_type = "H" if self._dta_version <= 118 else "I" + bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) # 117 uses 4 bytes, 118 uses 8 nobs_size = "I" if self._dta_version == 117 else "Q" bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) @@ -3033,7 +3035,8 @@ def _write_varnames(self): def _write_sortlist(self): self._update_map("sortlist") - self._file.write(self._tag(b"\x00\00" * (self.nvar + 1), "sortlist")) + sort_size = 2 if self._dta_version < 119 else 4 + self._file.write(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist")) def _write_formats(self): self._update_map("formats") @@ -3173,13 +3176,14 @@ def _set_formats_and_types(self, dtypes): ) -class StataWriter118(StataWriter117): +class StataWriterUTF8(StataWriter117): """ - A class for writing Stata binary dta files in Stata 15 format (118) + Stata binary dta file writing in Stata 15 (118) and 16 (119) formats - DTA 118 format files support unicode string data (both fixed and strL) - format. Unicode is also supported in value labels, variable labels and - the dataset label. + DTA 118 and 119 format files support unicode string data (both fixed + and strL) format. Unicode is also supported in value labels, variable + labels and the dataset label. Format 119 is automatically used if the + file contains more than 32,767 variables. .. versionadded:: 1.0.0 @@ -3192,34 +3196,38 @@ class StataWriter118(StataWriter117): is written. data : DataFrame Input to save - convert_dates : dict + convert_dates : dict, default None Dictionary mapping columns containing datetime types to stata internal format to use when writing the dates. Options are 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. Datetime columns that do not have a conversion type specified will be converted to 'tc'. Raises NotImplementedError if a datetime column has timezone information - write_index : bool + write_index : bool, default True Write the index to Stata dataset. - byteorder : str + byteorder : str, default None Can be ">", "<", "little", or "big". 
default is `sys.byteorder` - time_stamp : datetime + time_stamp : datetime, default None A datetime to use as file creation date. Default is the current time - data_label : str + data_label : str, default None A label for the data set. Must be 80 characters or smaller. - variable_labels : dict + variable_labels : dict, default None Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - convert_strl : list + convert_strl : list, default None List of columns names to convert to Stata StrL format. Columns with more than 2045 characters are automatically written as StrL. Smaller columns can be converted by including the column name. Using StrLs can reduce output file size when strings are longer than 8 characters, and either frequently repeated or sparse. + version : int, default None + The dta version to use. By default, uses the size of data to determine + the version. 118 is used if data.shape[1] <= 32767, and 119 is used + for storing larger DataFrames. Returns ------- - StataWriter118 + StataWriterUTF8 The instance has a write_file method, which will write the file to the given `fname`. @@ -3238,24 +3246,60 @@ class StataWriter118(StataWriter117): -------- Using Unicode data and column names - >>> from pandas.io.stata import StataWriter118 + >>> from pandas.io.stata import StataWriterUTF8 >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) - >>> writer = StataWriter118('./data_file.dta', data) + >>> writer = StataWriterUTF8('./data_file.dta', data) >>> writer.write_file() Or with long strings stored in strl format >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], ... columns=['strls']) - >>> writer = StataWriter118('./data_file_with_long_strings.dta', data, - ... convert_strl=['strls']) + >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data, + ... convert_strl=['strls']) >>> writer.write_file() """ _encoding = "utf-8" - _dta_version = 118 - def _validate_variable_name(self, name): + def __init__( + self, + fname: FilePathOrBuffer, + data: DataFrame, + convert_dates: Optional[Dict[Hashable, str]] = None, + write_index: bool = True, + byteorder: Optional[str] = None, + time_stamp: Optional[datetime.datetime] = None, + data_label: Optional[str] = None, + variable_labels: Optional[Dict[Hashable, str]] = None, + convert_strl: Optional[Sequence[Hashable]] = None, + version: Optional[int] = None, + ): + if version is None: + version = 118 if data.shape[1] <= 32767 else 119 + elif version not in (118, 119): + raise ValueError("version must be either 118 or 119.") + elif version == 118 and data.shape[1] > 32767: + raise ValueError( + "You must use version 119 for data sets containing more than " + "32,767 variables" + ) + + super().__init__( + fname, + data, + convert_dates=convert_dates, + write_index=write_index, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels, + convert_strl=convert_strl, + ) + # Override version set in StataWriter117 init + self._dta_version = version + + def _validate_variable_name(self, name: str) -> str: """ Validate variable names for Stata export. @@ -3272,7 +3316,7 @@ def _validate_variable_name(self, name): Notes ----- - Stata 118 support most unicode characters. The only limatation is in + Stata 118+ supports most unicode characters. The only limitation is in the ascii range where the characters supported are a-z, A-Z, 0-9 and _. 
""" # High code points appear to be acceptable diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 5e4ec116645b0..edb766a67af89 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -21,11 +21,22 @@ PossiblePrecisionLoss, StataMissingValue, StataReader, - StataWriter118, + StataWriterUTF8, read_stata, ) +@pytest.fixture() +def mixed_frame(): + return pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": [1.0, 3.0, 27.0, 81.0], + "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], + } + ) + + @pytest.fixture def dirpath(datapath): return datapath("io", "data", "stata") @@ -112,7 +123,7 @@ def read_dta(self, file): def read_csv(self, file): return read_csv(file, parse_dates=True) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_empty_dta(self, version): empty_ds = DataFrame(columns=["unit"]) # GH 7369, make sure can read a 0-obs dta file @@ -332,7 +343,7 @@ def test_write_dta6(self): check_index_type=False, ) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta10(self, version): original = DataFrame( data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], @@ -368,7 +379,7 @@ def test_write_preserves_original(self): df.to_stata(path, write_index=False) tm.assert_frame_equal(df, df_copy) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_encoding(self, version): # GH 4626, proper encoding handling @@ -409,7 +420,7 @@ def test_read_write_dta11(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta12(self, version): original = DataFrame( [(1, 2, 3, 4, 5, 6)], @@ -461,7 +472,7 @@ def test_read_write_dta13(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize( "file", ["dta14_113", "dta14_114", "dta14_115", "dta14_117"] ) @@ -504,7 +515,7 @@ def test_read_write_reread_dta15(self, file): tm.assert_frame_equal(expected, parsed) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_timestamp_and_label(self, version): original = DataFrame([(1,)], columns=["variable"]) time_stamp = datetime(2000, 2, 29, 14, 21) @@ -518,7 +529,7 @@ def test_timestamp_and_label(self, version): assert reader.time_stamp == "29 Feb 2000 14:21" assert reader.data_label == data_label - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_invalid_timestamp(self, version): original = DataFrame([(1,)], columns=["variable"]) time_stamp = "01 Jan 2000, 00:00:00" @@ -542,7 +553,7 @@ def test_numeric_column_names(self): written_and_read_again.columns = map(convert_col_name, columns) tm.assert_frame_equal(original, written_and_read_again) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nan_to_missing_value(self, version): s1 = Series(np.arange(4.0), dtype=np.float32) s2 = Series(np.arange(4.0), 
dtype=np.float64) @@ -662,7 +673,7 @@ def test_write_missing_strings(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize("byteorder", [">", "<"]) def test_bool_uint(self, byteorder, version): s0 = Series([0, 1, True], dtype=np.bool) @@ -908,7 +919,7 @@ def test_drop_column(self): columns = ["byte_", "int_", "long_", "not_found"] read_stata(self.dta15_117, convert_dates=True, columns=columns) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.filterwarnings( "ignore:\\nStata value:pandas.io.stata.ValueLabelTypeMismatch" ) @@ -985,7 +996,7 @@ def test_categorical_warnings_and_errors(self): original.to_stata(path) # should get a warning for mixed content - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_categorical_with_stata_missing_values(self, version): values = [["a" + str(i)] for i in range(120)] values.append([np.nan]) @@ -1221,20 +1232,13 @@ def test_read_chunks_columns(self): tm.assert_frame_equal(from_frame, chunk, check_dtype=False) pos += chunksize - @pytest.mark.parametrize("version", [114, 117]) - def test_write_variable_labels(self, version): + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_write_variable_labels(self, version, mixed_frame): # GH 13631, add support for writing variable labels - original = pd.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [1.0, 3.0, 27.0, 81.0], - "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], - } - ) - original.index.name = "index" + mixed_frame.index.name = "index" variable_labels = {"a": "City Rank", "b": "City Exponent", "c": "City"} with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels, version=version) + mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) with StataReader(path) as sr: read_labels = sr.variable_labels() expected_labels = { @@ -1247,46 +1251,36 @@ def test_write_variable_labels(self, version): variable_labels["index"] = "The Index" with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels, version=version) + mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) with StataReader(path) as sr: read_labels = sr.variable_labels() assert read_labels == variable_labels - @pytest.mark.parametrize("version", [114, 117]) - def test_invalid_variable_labels(self, version): - original = pd.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [1.0, 3.0, 27.0, 81.0], - "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], - } - ) - original.index.name = "index" + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_invalid_variable_labels(self, version, mixed_frame): + mixed_frame.index.name = "index" variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"} with tm.ensure_clean() as path: msg = "Variable labels must be 80 characters or fewer" with pytest.raises(ValueError, match=msg): - original.to_stata( + mixed_frame.to_stata( path, variable_labels=variable_labels, version=version ) + @pytest.mark.parametrize("version", [114, 117]) + def test_invalid_variable_label_encoding(self, version, mixed_frame): + mixed_frame.index.name = "index" + variable_labels = {"a": "very long" * 10, "b": "City 
Exponent", "c": "City"} variable_labels["a"] = "invalid character Œ" with tm.ensure_clean() as path: with pytest.raises( ValueError, match="Variable labels must contain only characters" ): - original.to_stata( + mixed_frame.to_stata( path, variable_labels=variable_labels, version=version ) - def test_write_variable_label_errors(self): - original = pd.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [1.0, 3.0, 27.0, 81.0], - "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], - } - ) + def test_write_variable_label_errors(self, mixed_frame): values = ["\u03A1", "\u0391", "\u039D", "\u0394", "\u0391", "\u03A3"] variable_labels_utf8 = { @@ -1301,7 +1295,7 @@ def test_write_variable_label_errors(self): ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels_utf8) + mixed_frame.to_stata(path, variable_labels=variable_labels_utf8) variable_labels_long = { "a": "City Rank", @@ -1314,7 +1308,7 @@ def test_write_variable_label_errors(self): msg = "Variable labels must be 80 characters or fewer" with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels_long) + mixed_frame.to_stata(path, variable_labels=variable_labels_long) def test_default_date_conversion(self): # GH 12259 @@ -1636,7 +1630,7 @@ def test_invalid_date_conversion(self): with pytest.raises(ValueError, match=msg): original.to_stata(path, convert_dates={"wrong_name": "tc"}) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nonfile_writing(self, version): # GH 21041 bio = io.BytesIO() @@ -1699,7 +1693,7 @@ def test_mixed_string_strl(self): expected = output.fillna("") tm.assert_frame_equal(reread, expected) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_all_none_exception(self, version): output = [{"none": "none", "number": 0}, {"none": None, "number": 1}] output = pd.DataFrame(output) @@ -1708,7 +1702,7 @@ def test_all_none_exception(self, version): with pytest.raises(ValueError, match="Column `none` cannot be exported"): output.to_stata(path, version=version) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_invalid_file_not_written(self, version): content = "Here is one __�__ Another one __·__ Another one __½__" df = DataFrame([content], columns=["invalid"]) @@ -1770,7 +1764,8 @@ def test_stata_119(self): assert df.iloc[0, -1] == 1 assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21)) - def test_118_writer(self): + @pytest.mark.parametrize("version", [118, 119, None]) + def test_utf8_writer(self, version): cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) data = pd.DataFrame( [ @@ -1791,13 +1786,14 @@ def test_118_writer(self): data_label = "ᴅaᵀa-label" data["β"] = data["β"].astype(np.int32) with tm.ensure_clean() as path: - writer = StataWriter118( + writer = StataWriterUTF8( path, data, data_label=data_label, convert_strl=["strls"], variable_labels=variable_labels, write_index=False, + version=version, ) writer.write_file() reread_encoded = read_stata(path) @@ -1807,3 +1803,16 @@ def test_118_writer(self): reader = StataReader(path) assert reader.data_label == data_label assert reader.variable_labels() == variable_labels + + data.to_stata(path, version=version, write_index=False) + reread_to_stata = read_stata(path) + 
tm.assert_frame_equal(data, reread_to_stata) + + def test_writer_118_exceptions(self): + df = DataFrame(np.zeros((1, 33000), dtype=np.int8)) + with tm.ensure_clean() as path: + with pytest.raises(ValueError, match="version must be either 118 or 119."): + StataWriterUTF8(path, df, version=117) + with tm.ensure_clean() as path: + with pytest.raises(ValueError, match="You must use version 119"): + StataWriterUTF8(path, df, version=118) From 183506420759475dfe6765c9ff78ee92640704cf Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Tue, 14 Jan 2020 22:18:41 +0200 Subject: [PATCH 039/158] TST: bare pytest raises (#31001) --- pandas/tests/window/test_rolling.py | 36 ++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 04fab93b71c4a..ff435f8386a85 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -32,23 +32,34 @@ def test_constructor(self, which): c = o.rolling # valid + c(0) c(window=2) c(window=2, min_periods=1) c(window=2, min_periods=1, center=True) c(window=2, min_periods=1, center=False) # GH 13383 - with pytest.raises(ValueError): - c(0) + + msg = "window must be non-negative" + + with pytest.raises(ValueError, match=msg): c(-1) # not valid for w in [2.0, "foo", np.array([2])]: - with pytest.raises(ValueError): + msg = ( + "window must be an integer|" + "passed window foo is not compatible with a datetimelike index" + ) + with pytest.raises(ValueError, match=msg): c(window=w) - with pytest.raises(ValueError): + + msg = "min_periods must be an integer" + with pytest.raises(ValueError, match=msg): c(window=2, min_periods=w) - with pytest.raises(ValueError): + + msg = "center must be a boolean" + with pytest.raises(ValueError, match=msg): c(window=2, min_periods=1, center=w) @td.skip_if_no_scipy @@ -57,7 +68,10 @@ def test_constructor_with_win_type(self, which): # GH 13383 o = getattr(self, which) c = o.rolling - with pytest.raises(ValueError): + + msg = "window must be > 0" + + with pytest.raises(ValueError, match=msg): c(-1, win_type="boxcar") @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3)]) @@ -113,7 +127,10 @@ def test_numpy_compat(self, method): def test_closed(self): df = DataFrame({"A": [0, 1, 2, 3, 4]}) # closed only allowed for datetimelike - with pytest.raises(ValueError): + + msg = "closed only implemented for datetimelike and offset based windows" + + with pytest.raises(ValueError, match=msg): df.rolling(window=3, closed="neither") @pytest.mark.parametrize("closed", ["neither", "left"]) @@ -296,7 +313,10 @@ def test_iter_raises(self, klass): # https://github.com/pandas-dev/pandas/issues/11704 # Iteration over a Window obj = klass([1, 2, 3, 4]) - with pytest.raises(NotImplementedError): + + msg = "See issue #11704 https://github.com/pandas-dev/pandas/issues/11704" + + with pytest.raises(NotImplementedError, match=msg): iter(obj.rolling(2)) def test_rolling_axis_sum(self, axis_frame): From ee8398844665d553e661e1413abfbfea0e121b0c Mon Sep 17 00:00:00 2001 From: tonywu1999 Date: Tue, 14 Jan 2020 15:20:50 -0500 Subject: [PATCH 040/158] DOC: Changed links to sphinx documentation in contributing.rst file (GH31010) (#31011) --- doc/source/development/contributing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/development/contributing.rst 
b/doc/source/development/contributing.rst index 2dc5ed07544d1..4fdcb93745094 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -354,9 +354,9 @@ About the *pandas* documentation -------------------------------- The documentation is written in **reStructuredText**, which is almost like writing -in plain English, and built using `Sphinx <http://sphinx-doc.org/>`__. The +in plain English, and built using `Sphinx <https://www.sphinx-doc.org/en/master/>`__. The Sphinx Documentation has an excellent `introduction to reST -<http://sphinx-doc.org/rest.html>`__. Review the Sphinx docs to perform more +<https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html>`__. Review the Sphinx docs to perform more complex changes to the documentation as well. Some other important things to know about the docs: From 8d093fe4b84ec9c8ab0f54d499e83acaf7303e7d Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Tue, 14 Jan 2020 22:31:47 +0200 Subject: [PATCH 041/158] TST: Insert 'match' to bare pytest raises (#30997) --- pandas/tests/test_common.py | 13 ++++++++----- pandas/tests/test_downstream.py | 7 ++++++- pandas/tests/test_errors.py | 11 +++++++---- pandas/tests/test_lib.py | 13 ++++++++----- pandas/tests/test_take.py | 17 +++++++++++++---- 5 files changed, 42 insertions(+), 19 deletions(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index a8a0fcea7182c..186c735a0bff9 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -60,10 +60,11 @@ def test_random_state(): assert com.random_state() is np.random # Error for floats or strings - with pytest.raises(ValueError): + msg = "random_state must be an integer, a numpy RandomState, or None" + with pytest.raises(ValueError, match=msg): com.random_state("test") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): com.random_state(5.5) @@ -93,15 +94,17 @@ def test_dict_compat(): def test_standardize_mapping(): # No uninitialized defaultdicts - with pytest.raises(TypeError): + msg = r"to_dict\(\) only accepts initialized defaultdicts" + with pytest.raises(TypeError, match=msg): com.standardize_mapping(collections.defaultdict) # No non-mapping subtypes, instance - with pytest.raises(TypeError): + msg = "unsupported type: <class 'list'>" + with pytest.raises(TypeError, match=msg): com.standardize_mapping([]) # No non-mapping subtypes, class - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): com.standardize_mapping(list) fill = {"bad": "data"} diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 8edd9f20ec63c..02898988ca8aa 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -136,7 +136,12 @@ def test_missing_required_dependency(): # https://github.com/MacPython/pandas-wheels/pull/50 call = ["python", "-sSE", "-c", "import pandas"] - with pytest.raises(subprocess.CalledProcessError) as exc: + msg = ( + r"Command '\['python', '-sSE', '-c', 'import pandas'\]' " + "returned non-zero exit status 1." 
+ ) + + with pytest.raises(subprocess.CalledProcessError, match=msg) as exc: subprocess.check_output(call, stderr=subprocess.STDOUT) output = exc.value.stdout.decode() diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index fa2142444ed92..939ea8a64d94d 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -22,12 +22,15 @@ def test_exception_importable(exc): from pandas import errors - e = getattr(errors, exc) - assert e is not None + err = getattr(errors, exc) + assert err is not None # check that we can raise on them - with pytest.raises(e): - raise e() + + msg = "^$" + + with pytest.raises(err, match=msg): + raise err() def test_catch_oob(): diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index f839aa198d03f..d914cf873de24 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -22,7 +22,8 @@ def test_max_len_string_array(self): assert libwriters.max_len_string_array(arr) == 3 # raises - with pytest.raises(TypeError): + msg = "No matching signature found" + with pytest.raises(TypeError, match=msg): libwriters.max_len_string_array(arr.astype("U")) def test_fast_unique_multiple_list_gen_sort(self): @@ -100,9 +101,11 @@ def test_maybe_indices_to_slice_right_edge(self): assert not isinstance(maybe_slice, slice) tm.assert_numpy_array_equal(maybe_slice, indices) - with pytest.raises(IndexError): + msg = "index 100 is out of bounds for axis (0|1) with size 100" + + with pytest.raises(IndexError, match=msg): target[indices] - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): target[maybe_slice] indices = np.array([100, 99, 98, 97], dtype=np.int64) @@ -111,9 +114,9 @@ def test_maybe_indices_to_slice_right_edge(self): assert not isinstance(maybe_slice, slice) tm.assert_numpy_array_equal(maybe_slice, indices) - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): target[indices] - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): target[maybe_slice] for case in [[99, 97, 99, 96], [99, 99, 98, 97], [98, 98, 97, 96]]: diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index 465296a6f9e51..1cd5f11057464 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -423,16 +423,21 @@ class TestExtensionTake: def test_bounds_check_large(self): arr = np.array([1, 2]) - with pytest.raises(IndexError): + + msg = "indices are out-of-bounds" + with pytest.raises(IndexError, match=msg): algos.take(arr, [2, 3], allow_fill=True) - with pytest.raises(IndexError): + msg = "index 2 is out of bounds for size 2" + with pytest.raises(IndexError, match=msg): algos.take(arr, [2, 3], allow_fill=False) def test_bounds_check_small(self): arr = np.array([1, 2, 3], dtype=np.int64) indexer = [0, -1, -2] - with pytest.raises(ValueError): + + msg = r"'indices' contains values less than allowed \(-2 < -1\)" + with pytest.raises(ValueError, match=msg): algos.take(arr, indexer, allow_fill=True) result = algos.take(arr, indexer) @@ -446,7 +451,11 @@ def test_take_empty(self, allow_fill): result = algos.take(arr, [], allow_fill=allow_fill) tm.assert_numpy_array_equal(arr, result) - with pytest.raises(IndexError): + msg = ( + r"cannot do a non-empty take from an empty axes.|" + "indices are out-of-bounds" + ) + with pytest.raises(IndexError, match=msg): algos.take(arr, [0], allow_fill=allow_fill) def test_take_na_empty(self): From 37871335291154f3e2c46b94c9a6157c15e37275 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jan 2020 15:33:14 
-0600 Subject: [PATCH 042/158] DOC: Fixed documented value of `pd.NA ** 0` (#31005) (#31015) Co-authored-by: tsvikas --- doc/source/user_guide/missing_data.rst | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index df9949e8ac261..0f55980b3d015 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -825,13 +825,10 @@ For example, ``pd.NA`` propagates in arithmetic operations, similarly to There are a few special cases when the result is known, even when one of the operands is ``NA``. +.. ipython:: python -================ ====== -Operation Result -================ ====== -``pd.NA ** 0`` 0 -``1 ** pd.NA`` 1 -================ ====== + pd.NA ** 0 + 1 ** pd.NA In equality and comparison operations, ``pd.NA`` also propagates. This deviates from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always From cb05c13b226507b5b321c5216fe8b7b0f8952c8d Mon Sep 17 00:00:00 2001 From: Anthony Milbourne <18662115+amilbourne@users.noreply.github.com> Date: Tue, 14 Jan 2020 23:24:59 +0000 Subject: [PATCH 043/158] DOC: Clarified documentation for convert_dates and use_default_dates params (#30949) --- pandas/io/json/_json.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 12ce5e4a62d24..ae6ae70cbac72 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -439,8 +439,17 @@ def read_json( Not applicable for ``orient='table'``. convert_dates : bool or list of str, default True - List of columns to parse for dates. If True, then try to parse - datelike columns. A column label is datelike if + If True then default datelike columns may be converted (depending on + keep_default_dates). + If False, no dates will be converted. + If a list of column names, then those columns will be converted and + default datelike columns may also be converted (depending on + keep_default_dates). + + keep_default_dates : bool, default True + If parsing dates (convert_dates is not False), then try to parse the + default datelike columns. + A column label is datelike if * it ends with ``'_at'``, @@ -452,9 +461,6 @@ def read_json( * it is ``'date'``. - keep_default_dates : bool, default True - If parsing dates, then parse the default datelike columns. - numpy : bool, default False Direct decoding to numpy arrays. Supports numeric data only, but non-numeric column and index labels are supported. 
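To make the clarified convert_dates/keep_default_dates interaction concrete, a small self-contained sketch (inline JSON, so nothing external is assumed):

    import pandas as pd

    data = '{"date": {"0": "2020-01-09"}, "value": {"0": 1}}'

    # Default: convert_dates=True with keep_default_dates=True, so the
    # column named "date" is parsed because its label is datelike.
    print(pd.read_json(data).dtypes)  # date -> datetime64[ns]

    # convert_dates=False disables all date conversion.
    print(pd.read_json(data, convert_dates=False).dtypes)  # date -> object
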
Note also that the From 6b87716e1416061b4bf0109a89caf962acdf8c12 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Wed, 15 Jan 2020 00:36:15 +0000 Subject: [PATCH 044/158] CI: Remove unrequired config (#31024) --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a11cd469e9b9c..c24c6f06de1fc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,9 +25,6 @@ git: matrix: fast_finish: true - exclude: - # Exclude the default Python 3.5 build - - python: 3.5 include: - env: From 6837794a9af034cd7e5807e627eb21e52b68edc6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jan 2020 20:56:35 -0600 Subject: [PATCH 045/158] BUG: Preserve string dtype in extract (#31018) specifically with multiple capture groups and expand=False Closes https://github.com/pandas-dev/pandas/issues/30969 --- pandas/core/strings.py | 3 ++- pandas/tests/test_strings.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 0323eafff8dee..4bcf2943e3d6e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -884,11 +884,12 @@ def _str_extract_noexpand(arr, pat, flags=0): if arr.empty: result = DataFrame(columns=columns, dtype=object) else: + dtype = _result_dtype(arr) result = DataFrame( [groups_or_na(val) for val in arr], columns=columns, index=arr.index, - dtype=object, + dtype=dtype, ) return result, name diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index c37c78f3b9235..62d26dacde67b 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3573,3 +3573,18 @@ def test_string_array_boolean_array(method, expected): result = getattr(s.str, method)() expected = Series(expected, dtype="boolean") tm.assert_series_equal(result, expected) + + +def test_string_array_extract(): + # https://github.com/pandas-dev/pandas/issues/30969 + # Only expand=False & multiple groups was failing + a = Series(["a1", "b2", "cc"], dtype="string") + b = Series(["a1", "b2", "cc"], dtype="object") + pat = r"(\w)(\d)" + + result = a.str.extract(pat, expand=False) + expected = b.str.extract(pat, expand=False) + assert all(result.dtypes == "string") + + result = result.astype(object) + tm.assert_equal(result, expected) From 51cb276e623613991f94c4ef3f1ac5c5f5327655 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jan 2020 18:58:06 -0800 Subject: [PATCH 046/158] CLN: remove checks for inferred_dtype==unicode (#31020) --- pandas/_testing.py | 4 ++-- pandas/core/algorithms.py | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/indexes/base.py | 3 +-- pandas/io/parquet.py | 2 +- pandas/io/parsers.py | 2 +- pandas/io/stata.py | 2 +- 7 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 1fdc5d478aaf6..018551224c582 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -613,8 +613,8 @@ def _check_types(l, r, obj="Index"): assert_attr_equal("dtype", l, r, obj=obj) # allow string-like to have different inferred_types - if l.inferred_type in ("string", "unicode"): - assert r.inferred_type in ("string", "unicode") + if l.inferred_type in ("string",): + assert r.inferred_type in ("string",) else: assert_attr_equal("inferred_type", l, r, obj=obj) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 39e8e9008a844..59256f6924b79 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -201,7 +201,7 @@ def _ensure_arraylike(values): """ if not is_array_like(values): inferred = lib.infer_dtype(values, skipna=False) - if inferred in ["mixed", "string", "unicode"]: + if inferred in ["mixed", "string"]: if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1dbdb8dbba48b..2a09bd7e54a8e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -670,7 +670,7 @@ def infer_dtype_from_array(arr, pandas_dtype: bool = False): # don't force numpy coerce with nan's inferred = lib.infer_dtype(arr, skipna=False) - if inferred in ["string", "bytes", "unicode", "mixed", "mixed-integer"]: + if inferred in ["string", "bytes", "mixed", "mixed-integer"]: return (np.object_, arr) arr = np.asarray(arr) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f2f53f564da76..47daaa4958411 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -910,7 +910,7 @@ def _format_data(self, name=None): # do we want to justify (only do so for non-objects) is_justify = not ( - self.inferred_type in ("string", "unicode") + self.inferred_type in ("string",) or ( self.inferred_type == "categorical" and is_object_dtype(self.categories) ) ) @@ -2860,7 +2860,6 @@ def _convert_scalar_indexer(self, key, kind=None): "mixed-integer-float", "integer-na", "string", - "unicode", "mixed", ]: self._invalid_indexer("label", key) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 4be62b886f076..98f2eb3929b59 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -51,7 +51,7 @@ def validate_dataframe(df: DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") # must have value column names (strings only) - if df.columns.inferred_type not in {"string", "unicode", "empty"}: + if df.columns.inferred_type not in {"string", "empty"}: raise ValueError("parquet must have string column names") # index level names must be strings diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 41db6ed0ef503..84a8b5b2a94fe 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1304,7 +1304,7 @@ def _validate_usecols_arg(usecols): usecols_dtype = lib.infer_dtype(usecols, skipna=False) - if usecols_dtype not in ("empty", "integer", "string", "unicode"): + if usecols_dtype not in ("empty", "integer", "string"): raise ValueError(msg) usecols = set(usecols) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b8e04ad55dde1..cee5f3d280991 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2334,7 +2334,7 @@ def _encode_strings(self): dtype = column.dtype if dtype.type == np.object_: inferred_dtype = infer_dtype(column, skipna=True) - if not ((inferred_dtype in ("string", "unicode")) or len(column) == 0): + if not ((inferred_dtype in ("string",)) or len(column) == 0): col = column.name raise ValueError( f"""\ From 4ed9a3904abe6a92d1f6616a44adedbaf41af11f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jan 2020 18:59:59 -0800 Subject: [PATCH 047/158] CLN: remove unused NDFrame methods (#30935) --- pandas/core/frame.py | 47 ++++++++++-- pandas/core/generic.py | 111 --------------------------- pandas/core/reshape/concat.py | 4 +- pandas/tests/generic/test_generic.py | 3 - 4 files changed, 41 insertions(+), 124 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 42dc21156ba59..6dd3a415297db 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4942,19 +4942,52 @@ def sort_values( else: return 
self._constructor(new_data).__finalize__(self) - @Substitution(**_shared_doc_kwargs) - @Appender(NDFrame.sort_index.__doc__) def sort_index( self, axis=0, level=None, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - sort_remaining=True, + ascending: bool = True, + inplace: bool = False, + kind: str = "quicksort", + na_position: str = "last", + sort_remaining: bool = True, ignore_index: bool = False, ): + """ + Sort object by labels (along an axis). + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also ndarray.np.sort for more + information. `mergesort` is the only stable algorithm. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + Not implemented for MultiIndex. + sort_remaining : bool, default True + If True and sorting by level and index is multilevel, sort by other + levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 + + Returns + ------- + sorted_obj : DataFrame or None + DataFrame with sorted index if inplace=False, None otherwise. + """ # TODO: this can be combined with Series.sort_index impl as # almost identical diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c501ada6b5783..0c413cd473bbc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -350,13 +350,6 @@ def _construct_axes_dict(self, axes=None, **kwargs): d.update(kwargs) return d - @staticmethod - def _construct_axes_dict_from(self, axes, **kwargs): - """Return an axes dictionary for the passed axes.""" - d = {a: ax for a, ax in zip(self._AXIS_ORDERS, axes)} - d.update(kwargs) - return d - def _construct_axes_from_arguments( self, args, kwargs, require_all: bool = False, sentinel=None ): @@ -385,18 +378,6 @@ def _construct_axes_from_arguments( axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS} return axes, kwargs - @classmethod - def _from_axes(cls: Type[FrameOrSeries], data, axes, **kwargs) -> FrameOrSeries: - # for construction from BlockManager - if isinstance(data, BlockManager): - return cls(data, **kwargs) - else: - if cls._AXIS_REVERSED: - axes = axes[::-1] - d = cls._construct_axes_dict_from(cls, axes, copy=False) - d.update(kwargs) - return cls(data, **d) - @classmethod def _get_axis_number(cls, axis): axis = cls._AXIS_ALIASES.get(axis, axis) @@ -911,25 +892,6 @@ def squeeze(self, axis=None): ) ] - def swaplevel(self: FrameOrSeries, i=-2, j=-1, axis=0) -> FrameOrSeries: - """ - Swap levels i and j in a MultiIndex on a particular axis - - Parameters - ---------- - i, j : int, str (can be mixed) - Level of index to be swapped. Can pass level name as string. 
- - Returns - ------- - swapped : same type as caller (new object) - """ - axis = self._get_axis_number(axis) - result = self.copy() - labels = result._data.axes[axis] - result._data.set_axis(axis, labels.swaplevel(i, j)) - return result - # ---------------------------------------------------------------------- # Rename @@ -4224,69 +4186,6 @@ def sort_values( """ raise AbstractMethodError(self) - def sort_index( - self, - axis=0, - level=None, - ascending: bool_t = True, - inplace: bool_t = False, - kind: str = "quicksort", - na_position: str = "last", - sort_remaining: bool_t = True, - ignore_index: bool_t = False, - ): - """ - Sort object by labels (along an axis). - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis along which to sort. The value 0 identifies the rows, - and 1 identifies the columns. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - ascending : bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' - Choice of sorting algorithm. See also ndarray.np.sort for more - information. `mergesort` is the only stable algorithm. For - DataFrames, this option is only applied when sorting on a single - column or label. - na_position : {'first', 'last'}, default 'last' - Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. - Not implemented for MultiIndex. - sort_remaining : bool, default True - If True and sorting by level and index is multilevel, sort by other - levels too (in order) after sorting by specified level. - ignore_index : bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. - - .. versionadded:: 1.0.0 - - Returns - ------- - sorted_obj : DataFrame or None - DataFrame with sorted index if inplace=False, None otherwise. - """ - inplace = validate_bool_kwarg(inplace, "inplace") - axis = self._get_axis_number(axis) - axis_name = self._get_axis_name(axis) - labels = self._get_axis(axis) - - if level is not None: - raise NotImplementedError("level is not implemented") - if inplace: - raise NotImplementedError("inplace is not implemented") - - sort_index = labels.argsort() - if not ascending: - sort_index = sort_index[::-1] - - new_axis = labels.take(sort_index) - return self.reindex(**{axis_name: new_axis}) - def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: """ Conform %(klass)s to new index with optional filling logic. @@ -5369,11 +5268,6 @@ def _is_numeric_mixed_type(self): f = lambda: self._data.is_numeric_mixed_type return self._protect_consolidate(f) - @property - def _is_datelike_mixed_type(self): - f = lambda: self._data.is_datelike_mixed_type - return self._protect_consolidate(f) - def _check_inplace_setting(self, value) -> bool_t: """ check whether we allow in-place setting with this type of value """ @@ -5482,11 +5376,6 @@ def _values(self) -> np.ndarray: """internal implementation""" return self.values - @property - def _get_values(self) -> np.ndarray: - # compat - return self.values - def _internal_get_values(self) -> np.ndarray: """ Return an ndarray after converting sparse values to dense. 
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 449f70b2be2fd..9528de36a3664 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -499,9 +499,7 @@ def get_result(self): new_data._consolidate_inplace() cons = self.objs[0]._constructor - return cons._from_axes(new_data, self.new_axes).__finalize__( - self, method="concat" - ) + return cons(new_data).__finalize__(self, method="concat") def _get_result_dim(self) -> int: if self._is_series and self.axis == 1: diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 10a1e09a09bf8..efb04c7f63c66 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -548,9 +548,6 @@ def test_validate_bool_args(self): with pytest.raises(ValueError): super(DataFrame, df).drop("a", axis=1, inplace=value) - with pytest.raises(ValueError): - super(DataFrame, df).sort_index(inplace=value) - with pytest.raises(ValueError): super(DataFrame, df)._consolidate(inplace=value) From bea57898922c65647d04d900e492e8b1d5597b75 Mon Sep 17 00:00:00 2001 From: Simon Legner Date: Wed, 15 Jan 2020 04:04:47 +0100 Subject: [PATCH 048/158] doc: update copyright year (#31022) --- doc/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 481c03ab8f388..c6786a03f0e44 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -137,7 +137,7 @@ # General information about the project. project = "pandas" -copyright = "2008-2014, the pandas development team" +copyright = "2008-2020, the pandas development team" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the From d243ca05e17ce31f2d228df35c7f01d77e23a29d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jan 2020 19:08:51 -0800 Subject: [PATCH 049/158] REF: PeriodIndex.get_loc (#31021) --- pandas/_libs/index.pyx | 2 +- pandas/core/indexes/period.py | 121 +++++++++++-------- pandas/tests/indexes/period/test_indexing.py | 22 ++++ 3 files changed, 92 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ce6d12d61c521..07e8534b84c50 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -498,7 +498,7 @@ cdef class TimedeltaEngine(DatetimeEngine): cdef class PeriodEngine(Int64Engine): cdef _get_index_values(self): - return super(PeriodEngine, self).vgetter() + return super(PeriodEngine, self).vgetter().view("i8") cdef void _call_map_locations(self, values): # super(...) 
pattern doesn't seem to work with `cdef` diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 4e3689078d535..123353b620bfa 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -4,7 +4,7 @@ import numpy as np from pandas._libs import index as libindex -from pandas._libs.tslibs import NaT, frequencies as libfrequencies, iNaT, resolution +from pandas._libs.tslibs import NaT, frequencies as libfrequencies, resolution from pandas._libs.tslibs.period import Period from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -17,6 +17,7 @@ is_float_dtype, is_integer, is_integer_dtype, + is_list_like, is_object_dtype, pandas_dtype, ) @@ -42,7 +43,6 @@ ) from pandas.core.indexes.datetimes import DatetimeIndex, Index from pandas.core.indexes.numeric import Int64Index -from pandas.core.missing import isna from pandas.core.ops import get_op_result_name from pandas.core.tools.datetimes import DateParseError, parse_time_string @@ -507,42 +507,43 @@ def get_value(self, series, key): Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing """ - s = com.values_from_object(series) - try: - value = super().get_value(s, key) - except (KeyError, IndexError): - if isinstance(key, str): - asdt, parsed, reso = parse_time_string(key, self.freq) - grp = resolution.Resolution.get_freq_group(reso) - freqn = resolution.get_freq_group(self.freq) - - vals = self._ndarray_values - - # if our data is higher resolution than requested key, slice - if grp < freqn: - iv = Period(asdt, freq=(grp, 1)) - ord1 = iv.asfreq(self.freq, how="S").ordinal - ord2 = iv.asfreq(self.freq, how="E").ordinal - - if ord2 < vals[0] or ord1 > vals[-1]: - raise KeyError(key) - - pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) - key = slice(pos[0], pos[1] + 1) - return series[key] - elif grp == freqn: - key = Period(asdt, freq=self.freq).ordinal - return com.maybe_box( - self, self._int64index.get_value(s, key), series, key - ) - else: + if is_integer(key): + return series.iat[key] + + if isinstance(key, str): + asdt, parsed, reso = parse_time_string(key, self.freq) + grp = resolution.Resolution.get_freq_group(reso) + freqn = resolution.get_freq_group(self.freq) + + vals = self._ndarray_values + + # if our data is higher resolution than requested key, slice + if grp < freqn: + iv = Period(asdt, freq=(grp, 1)) + ord1 = iv.asfreq(self.freq, how="S").ordinal + ord2 = iv.asfreq(self.freq, how="E").ordinal + + if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) - period = Period(key, self.freq) - key = period.value if isna(period) else period.ordinal - return com.maybe_box(self, self._int64index.get_value(s, key), series, key) - else: - return com.maybe_box(self, value, series, key) + pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) + key = slice(pos[0], pos[1] + 1) + return series[key] + elif grp == freqn: + key = Period(asdt, freq=self.freq) + loc = self.get_loc(key) + return series.iloc[loc] + else: + raise KeyError(key) + + elif isinstance(key, Period) or key is NaT: + ordinal = key.ordinal if key is not NaT else NaT.value + loc = self._engine.get_loc(ordinal) + return series[loc] + + # slice, PeriodIndex, np.ndarray, List[Period] + value = Index.get_value(self, series, key) + return com.maybe_box(self, value, series, key) @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): @@ -579,36 +580,52 @@ def get_indexer_non_unique(self, 
target): def get_loc(self, key, method=None, tolerance=None): """ - Get integer location for requested label + Get integer location for requested label. + + Parameters + ---------- + key : Period, NaT, str, or datetime + String or datetime key must be parseable as Period. Returns ------- - loc : int + loc : int or ndarray[int64] + + Raises + ------ + KeyError + Key is not present in the index. + TypeError + If key is listlike or otherwise not hashable. """ - try: - return self._engine.get_loc(key) - except KeyError: - if is_integer(key): - raise + if isinstance(key, str): try: asdt, parsed, reso = parse_time_string(key, self.freq) key = asdt - except TypeError: - pass except DateParseError: # A string with invalid format raise KeyError(f"Cannot interpret '{key}' as period") - try: - key = Period(key, freq=self.freq) - except ValueError: - # we cannot construct the Period - # as we have an invalid type - raise KeyError(key) + elif is_integer(key): + # Period constructor will cast to string, which we dont want + raise KeyError(key) + + try: + key = Period(key, freq=self.freq) + except ValueError: + # we cannot construct the Period + # as we have an invalid type + if is_list_like(key): + raise TypeError(f"'{key}' is an invalid key") + raise KeyError(key) + + ordinal = key.ordinal if key is not NaT else key.value + try: + return self._engine.get_loc(ordinal) + except KeyError: try: - ordinal = iNaT if key is NaT else key.ordinal if tolerance is not None: tolerance = self._convert_tolerance(tolerance, np.asarray(key)) return self._int64index.get_loc(ordinal, method, tolerance) diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 2e3bf852667e5..592dccc5fc8ed 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -451,6 +451,28 @@ def test_get_loc(self): tm.assert_numpy_array_equal(idx2.get_loc(p2), expected_idx2_p2) tm.assert_numpy_array_equal(idx2.get_loc(str(p2)), expected_idx2_p2) + def test_get_loc_integer(self): + dti = pd.date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + with pytest.raises(KeyError, match="16801"): + pi.get_loc(16801) + + pi2 = dti.to_period("Y") # duplicates, ordinals are all 46 + with pytest.raises(KeyError, match="46"): + pi2.get_loc(46) + + def test_get_value_integer(self): + dti = pd.date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + ser = pd.Series(range(3), index=pi) + with pytest.raises(IndexError, match="is out of bounds for axis 0 with size 3"): + pi.get_value(ser, 16801) + + pi2 = dti.to_period("Y") # duplicates, ordinals are all 46 + ser2 = pd.Series(range(3), index=pi2) + with pytest.raises(IndexError, match="is out of bounds for axis 0 with size 3"): + pi2.get_value(ser2, 46) + def test_is_monotonic_increasing(self): # GH 17717 p0 = pd.Period("2017-09-01") From 9b0ef5d07fb218df4e36e133d69b1ea4c6be43bd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jan 2020 19:10:24 -0800 Subject: [PATCH 050/158] refactor DTI.get_loc (#31023) --- pandas/core/indexes/datetimes.py | 77 ++++++++++++++------------------ 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 75515949d1855..23ced8987d8ac 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -647,32 +647,24 @@ def get_value(self, series, key): locs = self.indexer_at_time(key) return series.take(locs) - try: - value = Index.get_value(self, series, key) - 
except KeyError: + if isinstance(key, str): try: loc = self._get_string_slice(key) return series[loc] except (TypeError, ValueError, KeyError): pass - try: - return self.get_value_maybe_box(series, key) - except (TypeError, ValueError, KeyError): + stamp = self._maybe_cast_for_get_loc(key) + loc = self.get_loc(stamp) + return series[loc] + except (KeyError, ValueError): raise KeyError(key) - else: - return com.maybe_box(self, value, series, key) + + value = Index.get_value(self, series, key) + return com.maybe_box(self, value, series, key) def get_value_maybe_box(self, series, key): - # needed to localize naive datetimes - if self.tz is not None: - key = Timestamp(key) - if key.tzinfo is not None: - key = key.tz_convert(self.tz) - else: - key = key.tz_localize(self.tz) - elif not isinstance(key, Timestamp): - key = Timestamp(key) + key = self._maybe_cast_for_get_loc(key) values = self._engine.get_value(com.values_from_object(series), key, tz=self.tz) return com.maybe_box(self, values, series, key) @@ -684,20 +676,31 @@ def get_loc(self, key, method=None, tolerance=None): ------- loc : int """ + if is_scalar(key) and isna(key): + key = NaT # FIXME: do this systematically if tolerance is not None: # try converting tolerance now, so errors don't get swallowed by # the try/except clauses below tolerance = self._convert_tolerance(tolerance, np.asarray(key)) - if isinstance(key, datetime): + if isinstance(key, (datetime, np.datetime64)): # needed to localize naive datetimes - if key.tzinfo is None: - key = Timestamp(key, tz=self.tz) - else: - key = Timestamp(key).tz_convert(self.tz) + key = self._maybe_cast_for_get_loc(key) return Index.get_loc(self, key, method, tolerance) + elif isinstance(key, str): + try: + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError, OverflowError): + pass + + try: + stamp = self._maybe_cast_for_get_loc(key) + return Index.get_loc(self, stamp, method, tolerance) + except (KeyError, ValueError): + raise KeyError(key) + elif isinstance(key, timedelta): # GH#20464 raise TypeError( @@ -711,28 +714,16 @@ def get_loc(self, key, method=None, tolerance=None): ) return self.indexer_at_time(key) - try: - return Index.get_loc(self, key, method, tolerance) - except (KeyError, ValueError, TypeError): - try: - return self._get_string_slice(key) - except (TypeError, KeyError, ValueError, OverflowError): - pass + return Index.get_loc(self, key, method, tolerance) - try: - stamp = Timestamp(key) - if stamp.tzinfo is not None and self.tz is not None: - stamp = stamp.tz_convert(self.tz) - else: - stamp = stamp.tz_localize(self.tz) - return Index.get_loc(self, stamp, method, tolerance) - except KeyError: - raise KeyError(key) - except ValueError as e: - # list-like tolerance size must match target index size - if "list-like" in str(e): - raise e - raise KeyError(key) + def _maybe_cast_for_get_loc(self, key): + # needed to localize naive datetimes + key = Timestamp(key) + if key.tzinfo is None: + key = key.tz_localize(self.tz) + else: + key = key.tz_convert(self.tz) + return key def _maybe_cast_slice_bound(self, label, side, kind): """ From 698920f1d7ec40eaa025b7ccd0e4e2f78bcb89c8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jan 2020 19:14:40 -0800 Subject: [PATCH 051/158] REF: do all casting _before_ call to DatetimeEngine.get_loc (#30948) --- pandas/_libs/index.pyx | 52 ++++++++++++++----------- pandas/_libs/tslibs/conversion.pxd | 2 - pandas/_libs/tslibs/conversion.pyx | 25 ------------ pandas/tests/indexes/test_engines.py | 57 
++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 50 deletions(-) create mode 100644 pandas/tests/indexes/test_engines.py diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 07e8534b84c50..4bcdb5d96a32d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -17,8 +17,8 @@ cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.tslibs.conversion cimport maybe_datetimelike_to_i8 from pandas._libs.tslibs.nattype cimport c_NaT as NaT +from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.hashtable cimport HashTable @@ -407,20 +407,27 @@ cdef class DatetimeEngine(Int64Engine): cdef _get_box_dtype(self): return 'M8[ns]' + cdef int64_t _unbox_scalar(self, scalar) except? -1: + # NB: caller is responsible for ensuring tzawareness compat + # before we get here + if not (isinstance(scalar, _Timestamp) or scalar is NaT): + raise TypeError(scalar) + return scalar.value + def __contains__(self, object val): cdef: - int64_t loc + int64_t loc, conv + conv = self._unbox_scalar(val) if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: - return self._get_loc_duplicates(val) + return self._get_loc_duplicates(conv) values = self._get_index_values() - conv = maybe_datetimelike_to_i8(val) loc = values.searchsorted(conv, side='left') return values[loc] == conv self._ensure_mapping_populated() - return maybe_datetimelike_to_i8(val) in self.mapping + return conv in self.mapping cdef _get_index_values(self): return self.vgetter().view('i8') @@ -429,23 +436,26 @@ cdef class DatetimeEngine(Int64Engine): return algos.is_monotonic(values, timelike=True) cpdef get_loc(self, object val): + # NB: the caller is responsible for ensuring that we are called + # with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine) + cdef: int64_t loc if is_definitely_invalid_key(val): raise TypeError + try: + conv = self._unbox_scalar(val) + except TypeError: + raise KeyError(val) + # Welcome to the spaghetti factory if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: - val = maybe_datetimelike_to_i8(val) - return self._get_loc_duplicates(val) + return self._get_loc_duplicates(conv) values = self._get_index_values() - try: - conv = maybe_datetimelike_to_i8(val) - loc = values.searchsorted(conv, side='left') - except TypeError: - raise KeyError(val) + loc = values.searchsorted(conv, side='left') if loc == len(values) or values[loc] != conv: raise KeyError(val) @@ -453,21 +463,12 @@ cdef class DatetimeEngine(Int64Engine): self._ensure_mapping_populated() if not self.unique: - val = maybe_datetimelike_to_i8(val) - return self._get_loc_duplicates(val) + return self._get_loc_duplicates(conv) try: - return self.mapping.get_item(val.value) + return self.mapping.get_item(conv) except KeyError: raise KeyError(val) - except AttributeError: - pass - - try: - val = maybe_datetimelike_to_i8(val) - return self.mapping.get_item(val) - except (TypeError, ValueError): - raise KeyError(val) def get_indexer(self, values): self._ensure_mapping_populated() @@ -494,6 +495,11 @@ cdef class TimedeltaEngine(DatetimeEngine): cdef _get_box_dtype(self): return 'm8[ns]' + cdef int64_t _unbox_scalar(self, scalar) except? 
-1: + if not (isinstance(scalar, Timedelta) or scalar is NaT): + raise TypeError(scalar) + return scalar.value + cdef class PeriodEngine(Int64Engine): diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 36e6b14be182a..d4ae3fa8c5b99 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -25,6 +25,4 @@ cdef int64_t get_datetime64_nanos(object val) except? -1 cpdef int64_t pydt_to_i8(object pydt) except? -1 -cdef maybe_datetimelike_to_i8(object val) - cpdef datetime localize_pydatetime(datetime dt, object tz) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a2b433c2007ff..c8d354328a0f6 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -207,31 +207,6 @@ def datetime_to_datetime64(object[:] values): return result, inferred_tz -cdef inline maybe_datetimelike_to_i8(object val): - """ - Try to convert to a nanosecond timestamp. Fall back to returning the - input value. - - Parameters - ---------- - val : object - - Returns - ------- - val : int64 timestamp or original input - """ - cdef: - npy_datetimestruct dts - try: - return val.value - except AttributeError: - if is_datetime64_object(val): - return get_datetime64_value(val) - elif PyDateTime_Check(val): - return convert_datetime_to_tsobject(val, None).value - return val - - # ---------------------------------------------------------------------- # _TSObject Conversion diff --git a/pandas/tests/indexes/test_engines.py b/pandas/tests/indexes/test_engines.py new file mode 100644 index 0000000000000..ee224c9c6ec89 --- /dev/null +++ b/pandas/tests/indexes/test_engines.py @@ -0,0 +1,57 @@ +import re + +import pytest + +import pandas as pd + + +class TestDatetimeEngine: + @pytest.mark.parametrize( + "scalar", + [ + pd.Timedelta(pd.Timestamp("2016-01-01").asm8.view("m8[ns]")), + pd.Timestamp("2016-01-01").value, + pd.Timestamp("2016-01-01").to_pydatetime(), + pd.Timestamp("2016-01-01").to_datetime64(), + ], + ) + def test_not_contains_requires_timestamp(self, scalar): + dti1 = pd.date_range("2016-01-01", periods=3) + dti2 = dti1.insert(1, pd.NaT) # non-monotonic + dti3 = dti1.insert(3, dti1[0]) # non-unique + dti4 = pd.date_range("2016-01-01", freq="ns", periods=2_000_000) + dti5 = dti4.insert(0, dti4[0]) # over size threshold, not unique + + msg = "|".join([re.escape(str(scalar)), re.escape(repr(scalar))]) + for dti in [dti1, dti2, dti3, dti4, dti5]: + with pytest.raises(TypeError, match=msg): + scalar in dti._engine + + with pytest.raises(KeyError, match=msg): + dti._engine.get_loc(scalar) + + +class TestTimedeltaEngine: + @pytest.mark.parametrize( + "scalar", + [ + pd.Timestamp(pd.Timedelta(days=42).asm8.view("datetime64[ns]")), + pd.Timedelta(days=42).value, + pd.Timedelta(days=42).to_pytimedelta(), + pd.Timedelta(days=42).to_timedelta64(), + ], + ) + def test_not_contains_requires_timestamp(self, scalar): + tdi1 = pd.timedelta_range("42 days", freq="9h", periods=1234) + tdi2 = tdi1.insert(1, pd.NaT) # non-monotonic + tdi3 = tdi1.insert(3, tdi1[0]) # non-unique + tdi4 = pd.timedelta_range("42 days", freq="ns", periods=2_000_000) + tdi5 = tdi4.insert(0, tdi4[0]) # over size threshold, not unique + + msg = "|".join([re.escape(str(scalar)), re.escape(repr(scalar))]) + for tdi in [tdi1, tdi2, tdi3, tdi4, tdi5]: + with pytest.raises(TypeError, match=msg): + scalar in tdi._engine + + with pytest.raises(KeyError, match=msg): + tdi._engine.get_loc(scalar) From 
586bcb16023ae870b0ad7769f6d9077903705486 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Wed, 15 Jan 2020 04:19:23 +0100 Subject: [PATCH 052/158] BUG: pivot_table with multi-index columns only fails (#31013) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/reshape/pivot.py | 4 +++- pandas/tests/reshape/test_pivot.py | 20 ++++++++++++++------ 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 14f82c2e71519..721bcb0758992 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -140,7 +140,7 @@ Reshaping ^^^^^^^^^ - -- +- Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b443ba142369c..7109f23761188 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -117,7 +117,9 @@ def pivot_table( agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged - if table.index.nlevels > 1: + + # GH17038, this check should only happen if index is defined (not None) + if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 743fc50c87e96..e3a57da450334 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -896,12 +896,6 @@ def _check_output( totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() - # no rows - rtable = self.data.pivot_table( - columns=["AA", "BB"], margins=True, aggfunc=np.mean - ) - assert isinstance(rtable, Series) - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] @@ -951,6 +945,20 @@ def test_margins_dtype_len(self): tm.assert_frame_equal(expected, result) + @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) + def test_pivot_table_multiindex_only(self, cols): + # GH 17038 + df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]}) + + result = df2.pivot_table(values="v", columns=cols) + expected = DataFrame( + [[4, 5, 6]], + columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), + index=Index(["v"]), + ) + + tm.assert_frame_equal(result, expected) + def test_pivot_integer_columns(self): # caused by upstream bug in unstack From ef0cf1e703bad64560be08d9aeb1c4bd3621a689 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jan 2020 19:23:12 -0800 Subject: [PATCH 053/158] REF: implement _split_op_result (#31027) --- pandas/core/internals/blocks.py | 8 ++++++-- pandas/core/internals/managers.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f74033924f64e..5fe5290fa65f1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -362,13 +362,17 @@ def delete(self, loc): self.values = np.delete(self.values, loc, 0) self.mgr_locs = self.mgr_locs.delete(loc) - def apply(self, func, **kwargs): + def apply(self, func, **kwargs) -> List["Block"]: """ apply the function to my values; return a block if we are not one """ with np.errstate(all="ignore"): result = func(self.values, **kwargs) + return self._split_op_result(result) + + def _split_op_result(self, result) -> List["Block"]: + # 
See also: split_and_operate if is_extension_array_dtype(result) and result.ndim > 1: # if we get a 2D ExtensionArray, we need to split it into 1D pieces nbs = [] @@ -382,7 +386,7 @@ def apply(self, func, **kwargs): if not isinstance(result, Block): result = self.make_block(values=_block_shape(result, ndim=self.ndim)) - return result + return [result] def fillna(self, value, limit=None, inplace=False, downcast=None): """ fillna on the block with the value. If we fail, then convert to diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 066689b3e374e..01b2c36e9adf3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1341,7 +1341,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): # only one item and each mgr loc is a copy of that single # item. for mgr_loc in mgr_locs: - newblk = blk.copy(deep=True) + newblk = blk.copy(deep=False) newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) blocks.append(newblk) From 030a35cffcf1e79565e80041027ad68ba96f3f52 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jan 2020 20:22:17 -0800 Subject: [PATCH 054/158] PERF: RangeIndex.get_loc (#30930) --- pandas/core/indexes/range.py | 15 +++++++++------ pandas/tests/indexes/ranges/test_range.py | 7 ++++--- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b4cc71a25792f..5c79942efb908 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -14,6 +14,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, + is_float, is_integer, is_integer_dtype, is_list_like, @@ -344,12 +345,14 @@ def __contains__(self, key: Union[int, np.integer]) -> bool: @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): - if is_integer(key) and method is None and tolerance is None: - new_key = int(key) - try: - return self._range.index(new_key) - except ValueError: - raise KeyError(key) + if method is None and tolerance is None: + if is_integer(key) or (is_float(key) and key.is_integer()): + new_key = int(key) + try: + return self._range.index(new_key) + except ValueError: + raise KeyError(key) + raise KeyError(key) return super().get_loc(key, method=method, tolerance=tolerance) @Appender(_index_shared_docs["get_indexer"]) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 8d98ab18963b6..0e5d1d45ad6db 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -735,8 +735,9 @@ def test_engineless_lookup(self): assert "_engine" not in idx._cache - # The engine is still required for lookup of a different dtype scalar: + # Different types of scalars can be excluded immediately, no need to + # use the _engine with pytest.raises(KeyError, match="'a'"): - assert idx.get_loc("a") == -1 + idx.get_loc("a") - assert "_engine" in idx._cache + assert "_engine" not in idx._cache From 87188775b42c67791fc85df99aa02ad7a731c19d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 14 Jan 2020 20:31:50 -0800 Subject: [PATCH 055/158] ENH/TST: Allow more keywords to ensure_clean (#30915) These keywords will be passed through to tempfile constructor functions. 
Follow-up: https://github.com/pandas-dev/pandas/pull/30771

---
 pandas/_testing.py                      | 22 ++++++++++++++++++----
 pandas/tests/io/parser/test_common.py   | 16 +++++++---------
 pandas/tests/io/parser/test_encoding.py |  3 +--
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/pandas/_testing.py b/pandas/_testing.py
index 018551224c582..631d550c60534 100644
--- a/pandas/_testing.py
+++ b/pandas/_testing.py
@@ -473,7 +473,7 @@ def close(fignum=None):


 @contextmanager
-def ensure_clean(filename=None, return_filelike=False):
+def ensure_clean(filename=None, return_filelike=False, **kwargs):
     """
     Gets a temporary path and agrees to remove on close.

@@ -485,23 +485,37 @@ def ensure_clean(filename=None, return_filelike=False):
     return_filelike : bool (default False)
         if True, returns a file-like which is *always* cleaned. Necessary for
         savefig and other functions which want to append extensions.
+    **kwargs
+        Additional keywords passed in for creating a temporary file.
+        :meth:`tempfile.TemporaryFile` is used when `return_filelike` is ``True``.
+        :meth:`tempfile.mkstemp` is used when `return_filelike` is ``False``.
+        Note that the `filename` parameter will be passed in as the `suffix`
+        argument to either function.
+
+    See Also
+    --------
+    tempfile.TemporaryFile
+    tempfile.mkstemp
     """
     filename = filename or ""
     fd = None

+    kwargs["suffix"] = filename
+
     if return_filelike:
-        f = tempfile.TemporaryFile(suffix=filename)
+        f = tempfile.TemporaryFile(**kwargs)
+
         try:
             yield f
         finally:
             f.close()
     else:
-        # don't generate tempfile if using a path with directory specified
+        # Don't generate tempfile if using a path with directory specified.
         if len(os.path.dirname(filename)):
             raise ValueError("Can't pass a qualified name to ensure_clean()")

         try:
-            fd, filename = tempfile.mkstemp(suffix=filename)
+            fd, filename = tempfile.mkstemp(**kwargs)
         except UnicodeEncodeError:
             import pytest
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index 4c02a37b66455..6c17f40b790ac 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -8,7 +8,6 @@
 from io import StringIO
 import os
 import platform
-from tempfile import TemporaryFile
 from urllib.error import URLError

 import numpy as np
@@ -1847,16 +1846,15 @@ def test_temporary_file(all_parsers):
     parser = all_parsers
     data = "0 0"

-    new_file = TemporaryFile("w+")
-    new_file.write(data)
-    new_file.flush()
-    new_file.seek(0)
+    with tm.ensure_clean(mode="w+", return_filelike=True) as new_file:
+        new_file.write(data)
+        new_file.flush()
+        new_file.seek(0)

-    result = parser.read_csv(new_file, sep=r"\s+", header=None)
-    new_file.close()
+        result = parser.read_csv(new_file, sep=r"\s+", header=None)

-    expected = DataFrame([[0, 0]])
-    tm.assert_frame_equal(result, expected)
+        expected = DataFrame([[0, 0]])
+        tm.assert_frame_equal(result, expected)


 def test_internal_eof_byte(all_parsers):
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index 33abf4bb7d9ee..406e7bedfd298 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -5,7 +5,6 @@
 from io import BytesIO
 import os
-import tempfile

 import numpy as np
 import pytest
@@ -164,7 +163,7 @@ def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding)

     expected = DataFrame({"foo": ["bar"]})

-    with tempfile.TemporaryFile(mode="w+", encoding=encoding) as f:
+    with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f:
         f.write("foo\nbar")
         f.seek(0)

From dd94e0db9556f35a3ea91ce85714c0b1e151a770 Mon Sep 17 00:00:00 2001
From: "Uwe L. Korn"
Date: Wed, 15 Jan 2020 10:49:48 +0100
Subject: [PATCH 056/158] BLD: More lightweight mypy pre-commit hook (#30814)

---
 .pre-commit-config.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 809764a20a713..139b9e31df46c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,11 +20,11 @@ repos:
     rev: v0.730
     hooks:
     - id: mypy
-      # We run mypy over all files because of:
-      # * changes in type definitions may affect non-touched files.
-      # * Running it with `mypy pandas` and the filenames will lead to
-      #   spurious duplicate module errors,
-      #   see also https://github.com/pre-commit/mirrors-mypy/issues/5
-      pass_filenames: false
       args:
-      - pandas
+      # As long as some files are excluded from check-untyped-defs
+      # we have to exclude it from the pre-commit hook as the configuration
+      # is based on modules but the hook runs on files.
+      - --no-check-untyped-defs
+      - --follow-imports
+      - skip
+      files: pandas/

From 2cabca8394025ad62eac6cdd0858526000f324eb Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Wed, 15 Jan 2020 13:40:34 +0000
Subject: [PATCH 057/158] CI: Fix clipboard problems (#29712)

* CI: Fix inconsistent clipboard test

---
 .travis.yml                       |  8 ++++----
 ci/azure/posix.yml                |  8 ++++----
 ci/run_tests.sh                   | 12 ++++++------
 ci/setup_env.sh                   |  5 +++++
 pandas/tests/io/test_clipboard.py | 11 +----------
 5 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index c24c6f06de1fc..a23bc8a4e905f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,19 +28,19 @@ matrix:

   include:
     - env:
-        - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network)"
+        - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)"

     - env:
-        - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network)"
+        - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network and not clipboard)"

     - env:
-        - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1"
+        - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1"
       services:
         - mysql
         - postgresql

     - env:
-        - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1"
+        - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1"
       services:
         - mysql
         - postgresql
diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml
index 55e8e839f4fae..c9a2e4eefd19d 100644
--- a/ci/azure/posix.yml
+++ b/ci/azure/posix.yml
@@ -18,7 +18,7 @@ jobs:
       py36_minimum_versions:
         ENV_FILE: ci/deps/azure-36-minimum_versions.yaml
         CONDA_PY: "36"
-        PATTERN: "not slow and not network"
+        PATTERN: "not slow and not network and not clipboard"

       py36_locale_slow_old_np:
         ENV_FILE: ci/deps/azure-36-locale_slow.yaml
@@ -36,12 +36,12 @@ jobs:
         PATTERN: "not slow and not network"
         LANG: "it_IT.utf8"
         LC_ALL: "it_IT.utf8"
-        EXTRA_APT: "language-pack-it"
+        EXTRA_APT: "language-pack-it xsel"

       py36_32bit:
         ENV_FILE:
ci/deps/azure-36-32bit.yaml CONDA_PY: "36" - PATTERN: "not slow and not network" + PATTERN: "not slow and not network and not clipboard" BITS32: "yes" py37_locale: @@ -50,7 +50,7 @@ jobs: PATTERN: "not slow and not network" LANG: "zh_CN.utf8" LC_ALL: "zh_CN.utf8" - EXTRA_APT: "language-pack-zh-hans" + EXTRA_APT: "language-pack-zh-hans xsel" py37_np_dev: ENV_FILE: ci/deps/azure-37-numpydev.yaml diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 8020680d617d7..0cb1f4aabf352 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -14,14 +14,14 @@ if [ "$COVERAGE" ]; then COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME" fi -PYTEST_CMD="pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" - -# Travis does not have have an X server -if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - DISPLAY=DISPLAY=:99.0 - PYTEST_CMD="xvfb-run -e /dev/stdout $PYTEST_CMD" +# If no X server is found, we use xvfb to emulate it +if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then + export DISPLAY=":0" + XVFB="xvfb-run " fi +PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" + echo $PYTEST_CMD sh -c "$PYTEST_CMD" diff --git a/ci/setup_env.sh b/ci/setup_env.sh index db28eaea8956e..e5bee09fe2f79 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -114,6 +114,11 @@ echo "remove postgres if has been installed with conda" echo "we use the one from the CI" conda remove postgresql -y --force || true +echo +echo "remove qt" +echo "causes problems with the clipboard, we use xsel for that" +conda remove qt -y --force || true + echo echo "conda list pandas" conda list pandas diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index a69e5556f3e85..652cacaf14ffb 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -8,13 +8,7 @@ from pandas import DataFrame, get_option, read_clipboard import pandas._testing as tm -from pandas.io.clipboard import PyperclipException, clipboard_get, clipboard_set - -try: - DataFrame({"A": [1, 2]}).to_clipboard() - _DEPS_INSTALLED = 1 -except (PyperclipException, RuntimeError): - _DEPS_INSTALLED = 0 +from pandas.io.clipboard import clipboard_get, clipboard_set def build_kwargs(sep, excel): @@ -148,7 +142,6 @@ def test_mock_clipboard(mock_clipboard): @pytest.mark.single @pytest.mark.clipboard -@pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") @pytest.mark.usefixtures("mock_clipboard") class TestClipboard: def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): @@ -256,9 +249,7 @@ def test_round_trip_valid_encodings(self, enc, df): @pytest.mark.single @pytest.mark.clipboard -@pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."]) -@pytest.mark.xfail(reason="flaky in CI", strict=False) def test_raw_roundtrip(data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows clipboard_set(data) From 9a686358b68bf9754fc4012a6def91609b971522 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 15 Jan 2020 06:19:17 -0800 Subject: [PATCH 058/158] BUG: SystemError in df.sum (#30905) --- pandas/_libs/tslibs/c_timestamp.pyx | 5 +++-- pandas/tests/frame/test_analytics.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/c_timestamp.pyx 
b/pandas/_libs/tslibs/c_timestamp.pyx index 6e6b809b9b5a6..ed1df5f4fa595 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -57,11 +57,12 @@ def integer_op_not_supported(obj): # the caller; mypy finds this more palatable. cls = type(obj).__name__ + # GH#30886 using an fstring raises SystemError int_addsub_msg = ( - f"Addition/subtraction of integers and integer-arrays with {cls} is " + "Addition/subtraction of integers and integer-arrays with {cls} is " "no longer supported. Instead of adding/subtracting `n`, " "use `n * obj.freq`" - ) + ).format(cls=cls) return TypeError(int_addsub_msg) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 910230c737a2a..25b2997eb088f 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -823,6 +823,16 @@ def test_sum_bool(self, float_frame): bools.sum(1) bools.sum(0) + def test_sum_mixed_datetime(self): + # GH#30886 + df = pd.DataFrame( + {"A": pd.date_range("2000", periods=4), "B": [1, 2, 3, 4]} + ).reindex([2, 3, 4]) + result = df.sum() + + expected = pd.Series({"B": 7.0}) + tm.assert_series_equal(result, expected) + def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data the_mean = float_string_frame.mean(axis=0) From bc9d329ba83795845be6aa455178e2f8d753542b Mon Sep 17 00:00:00 2001 From: Alex Kirko Date: Wed, 15 Jan 2020 19:32:36 +0300 Subject: [PATCH 059/158] BUG: Fix Timestamp constructor changes value on ambiguous DST (#30995) * BUG: fix Timestamp constructor value change on DST Timestamp * BUG: expand if check for Linux * CLN: switch from isinstance to treat_tz_asdateutil * CLN: add comment to solution and move to v1.1.0 * DOC: reword bugfix comment --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/tslibs/conversion.pyx | 10 +++++++++- pandas/tests/indexes/datetimes/test_timezones.py | 8 +------- pandas/tests/scalar/timestamp/test_timestamp.py | 11 +++++++++++ 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 721bcb0758992..b5a7b19f160a4 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -59,6 +59,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ +- Bug in :class:`Timestamp` where constructing :class:`Timestamp` from ambiguous epoch time and calling constructor again changed :meth:`Timestamp.value` property (:issue:`24329`) - - diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index c8d354328a0f6..77f46016ee846 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -29,7 +29,7 @@ from pandas._libs.tslibs.util cimport ( from pandas._libs.tslibs.timedeltas cimport cast_from_unit from pandas._libs.tslibs.timezones cimport ( is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info, - get_timezone, maybe_get_tz, tz_compare) + get_timezone, maybe_get_tz, tz_compare, treat_tz_as_dateutil) from pandas._libs.tslibs.timezones import UTC from pandas._libs.tslibs.parsing import parse_datetime_string @@ -362,6 +362,14 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, obj.tzinfo = tz else: obj.value = pydatetime_to_dt64(ts, &obj.dts) + # GH 24329 When datetime is ambiguous, + # pydatetime_to_dt64 doesn't take DST into account + # but with dateutil timezone, get_utcoffset does + # so we need to correct for it + if treat_tz_as_dateutil(ts.tzinfo): + if ts.tzinfo.is_ambiguous(ts): + 
dst_offset = ts.tzinfo.dst(ts)
+                    obj.value += int(dst_offset.total_seconds() * 1e9)
         obj.tzinfo = ts.tzinfo

     if obj.tzinfo is not None and not is_utc(obj.tzinfo):
diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py
index 1505ac1dff29c..df64820777f3f 100644
--- a/pandas/tests/indexes/datetimes/test_timezones.py
+++ b/pandas/tests/indexes/datetimes/test_timezones.py
@@ -573,13 +573,7 @@ def test_dti_construction_ambiguous_endpoint(self, tz):
             "2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous="infer"
         )
         assert times[0] == Timestamp("2013-10-26 23:00", tz=tz, freq="H")
-
-        if str(tz).startswith("dateutil"):
-            # fixed ambiguous behavior
-            # see GH#14621
-            assert times[-1] == Timestamp("2013-10-27 01:00:00+0100", tz=tz, freq="H")
-        else:
-            assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz, freq="H")
+        assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz, freq="H")

     @pytest.mark.parametrize(
         "tz, option, expected",
diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py
index f1fcf46a936fd..c60406fdbc8a6 100644
--- a/pandas/tests/scalar/timestamp/test_timestamp.py
+++ b/pandas/tests/scalar/timestamp/test_timestamp.py
@@ -1081,3 +1081,14 @@ def test_dt_subclass_add_timedelta(lh, rh):
     result = lh + rh
     expected = SubDatetime(2000, 1, 1, 1)
     assert result == expected
+
+
+def test_constructor_ambiguous_dst():
+    # GH 24329
+    # Make sure that calling Timestamp constructor
+    # on Timestamp created from ambiguous time
+    # doesn't change Timestamp.value
+    ts = Timestamp(1382835600000000000, tz="dateutil/Europe/London")
+    expected = ts.value
+    result = Timestamp(ts).value
+    assert result == expected

From c139f08b525cae2149caf8414c5f40bd8ae65afd Mon Sep 17 00:00:00 2001
From: Bart
Date: Thu, 16 Jan 2020 01:15:17 +0100
Subject: [PATCH 060/158] a -> an (#31033)

---
 doc/source/getting_started/dsintro.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst
index 8bd271815549d..81a2f0ae7d162 100644
--- a/doc/source/getting_started/dsintro.rst
+++ b/doc/source/getting_started/dsintro.rst
@@ -136,7 +136,7 @@ Like a NumPy array, a pandas Series has a :attr:`~Series.dtype`.

 This is often a NumPy dtype. However, pandas and 3rd-party libraries
 extend NumPy's type system in a few places, in which case the dtype would
-be a :class:`~pandas.api.extensions.ExtensionDtype`. Some examples within
+be an :class:`~pandas.api.extensions.ExtensionDtype`. Some examples within
 pandas are :ref:`categorical` and :ref:`integer_na`. See
 :ref:`basics.dtypes` for more.
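For illustration, a minimal sketch of the round-trip behavior exercised by the
test added in PATCH 059, assuming a build with that patch applied. The epoch
value is the one used in the test and falls in the ambiguous hour when
Europe/London leaves DST::

    import pandas as pd

    # Construct a Timestamp from an ambiguous epoch instant, then pass it
    # back through the constructor; with the fix the value is unchanged.
    ts = pd.Timestamp(1382835600000000000, tz="dateutil/Europe/London")
    assert pd.Timestamp(ts).value == ts.value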
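Similarly, a minimal sketch of the dtype preservation targeted by PATCH 045
above, mirroring its new test and assuming the nullable "string" dtype is
available in the patched build::

    import pandas as pd

    s = pd.Series(["a1", "b2", "cc"], dtype="string")
    # Multiple capture groups with expand=False previously fell back to
    # object dtype; with the fix both result columns keep "string".
    result = s.str.extract(r"(\w)(\d)", expand=False)
    assert all(result.dtypes == "string")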
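And a sketch of the case fixed by PATCH 052 above, adapted from its test (the
column names here are illustrative)::

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "v": [4, 5, 6]})
    # With only `columns` set (no `index`), pivot_table previously failed;
    # it now returns a one-row frame whose columns are a MultiIndex
    # over ("a", "b").
    table = df.pivot_table(values="v", columns=["a", "b"])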
From 2170f4e49bdd6e7bb0441f142fce9528c0c6c422 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 16 Jan 2020 02:01:22 +0000 Subject: [PATCH 061/158] CI: Using docstring validator from numpydoc (#30746) --- ci/code_checks.sh | 4 +- environment.yml | 2 +- requirements-dev.txt | 4 +- scripts/tests/test_validate_docstrings.py | 1131 +-------------------- scripts/validate_docstrings.py | 833 ++------------- 5 files changed, 163 insertions(+), 1811 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 83ceb11dfcbf4..0cc42be42d61e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -314,8 +314,8 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA01, SA02, SA03, SA05)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA01,SA02,SA03,SA05 + MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03, SA05)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/environment.yml b/environment.yml index e244350a0bea0..5f1184e921119 100644 --- a/environment.yml +++ b/environment.yml @@ -27,7 +27,6 @@ dependencies: # documentation - gitpython # obtain contributors from git for whatsnew - sphinx - - numpydoc>=0.9.0 # documentation (jupyter notebooks) - nbconvert>=5.4.1 @@ -105,3 +104,4 @@ dependencies: - tabulate>=0.8.3 # DataFrame.to_markdown - pip: - git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master + - git+https://github.com/numpy/numpydoc diff --git a/requirements-dev.txt b/requirements-dev.txt index 017e6258d9941..08cbef2c7fc6b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -16,7 +16,6 @@ mypy==0.730 pycodestyle gitpython sphinx -numpydoc>=0.9.0 nbconvert>=5.4.1 nbsphinx pandoc @@ -70,4 +69,5 @@ sqlalchemy xarray pyreadstat tabulate>=0.8.3 -git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master \ No newline at end of file +git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master +git+https://github.com/numpy/numpydoc \ No newline at end of file diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index a1bccb1dd1629..b11de0c4ad860 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -1,819 +1,52 @@ -import functools import io -import random -import string import textwrap -import numpy as np import pytest import validate_docstrings -import pandas as pd -validate_one = validate_docstrings.validate_one - - -class GoodDocStrings: - """ - Collection of good doc strings. - - This class contains a lot of docstrings that should pass the validation - script without any errors. - """ - - def plot(self, kind, color="blue", **kwargs): - """ - Generate a plot. - - Render the data in the Series as a matplotlib plot of the - specified kind. - - Parameters - ---------- - kind : str - Kind of matplotlib plot. - color : str, default 'blue' - Color name or rgb code. 
-        **kwargs
-            These parameters will be passed to the matplotlib plotting
-            function.
-        """
-        pass
-
-    def swap(self, arr, i, j, *args, **kwargs):
-        """
-        Swap two indicies on an array.
-
-        Parameters
-        ----------
-        arr : list
-            The list having indexes swapped.
-        i, j : int
-            The indexes being swapped.
-        *args, **kwargs
-            Extraneous parameters are being permitted.
-        """
-        pass
-
-    def sample(self):
-        """
-        Generate and return a random number.
-
-        The value is sampled from a continuous uniform distribution between
-        0 and 1.
-
-        Returns
-        -------
-        float
-            Random number generated.
-        """
-        return random.random()
-
-    @functools.lru_cache(None)
-    def decorated_sample(self, max):
-        """
-        Generate and return a random integer between 0 and max.
-
-        Parameters
-        ----------
-        max : int
-            The maximum value of the random number.
-
-        Returns
-        -------
-        int
-            Random number generated.
-        """
-        return random.randint(0, max)
-
-    def random_letters(self):
-        """
-        Generate and return a sequence of random letters.
-
-        The length of the returned string is also random, and is also
-        returned.
-
-        Returns
-        -------
-        length : int
-            Length of the returned string.
-        letters : str
-            String of random letters.
-        """
-        length = random.randint(1, 10)
-        letters = "".join(random.sample(string.ascii_lowercase, length))
-        return length, letters
-
-    def sample_values(self):
-        """
-        Generate an infinite sequence of random numbers.
-
-        The values are sampled from a continuous uniform distribution between
-        0 and 1.
-
-        Yields
-        ------
-        float
-            Random number generated.
-        """
-        while True:
-            yield random.random()
-
-    def head(self):
-        """
-        Return the first 5 elements of the Series.
-
-        This function is mainly useful to preview the values of the
-        Series without displaying the whole of it.
-
-        Returns
-        -------
-        Series
-            Subset of the original series with the 5 first values.
-
-        See Also
-        --------
-        Series.tail : Return the last 5 elements of the Series.
-        Series.iloc : Return a slice of the elements in the Series,
-            which can also be used to return the first or last n.
-        """
-        return self.iloc[:5]
-
-    def head1(self, n=5):
-        """
-        Return the first elements of the Series.
-
-        This function is mainly useful to preview the values of the
-        Series without displaying the whole of it.
-
-        Parameters
-        ----------
-        n : int
-            Number of values to return.
-
-        Returns
-        -------
-        Series
-            Subset of the original series with the n first values.
-
-        See Also
-        --------
-        tail : Return the last n elements of the Series.
-
-        Examples
-        --------
-        >>> s = pd.Series(['Ant', 'Bear', 'Cow', 'Dog', 'Falcon'])
-        >>> s.head()
-        0       Ant
-        1      Bear
-        2       Cow
-        3       Dog
-        4    Falcon
-        dtype: object
-
-        With the `n` parameter, we can change the number of returned rows:
-
-        >>> s.head(n=3)
-        0     Ant
-        1    Bear
-        2     Cow
-        dtype: object
-        """
-        return self.iloc[:n]
-
-    def contains(self, pat, case=True, na=np.nan):
-        """
-        Return whether each value contains `pat`.
-
-        In this case, we are illustrating how to use sections, even
-        if the example is simple enough and does not require them.
-
-        Parameters
-        ----------
-        pat : str
-            Pattern to check for within each element.
-        case : bool, default True
-            Whether check should be done with case sensitivity.
-        na : object, default np.nan
-            Fill value for missing data.
-
-        Examples
-        --------
-        >>> s = pd.Series(['Antelope', 'Lion', 'Zebra', np.nan])
-        >>> s.str.contains(pat='a')
-        0    False
-        1    False
-        2     True
-        3      NaN
-        dtype: object
-
-        **Case sensitivity**
-
-        With `case_sensitive` set to `False` we can match `a` with both
-        `a` and `A`:
-
-        >>> s.str.contains(pat='a', case=False)
-        0     True
-        1    False
-        2     True
-        3      NaN
-        dtype: object
-
-        **Missing values**
-
-        We can fill missing values in the output using the `na` parameter:
-
-        >>> s.str.contains(pat='a', na=False)
-        0    False
-        1    False
-        2     True
-        3    False
-        dtype: bool
-        """
-        pass
-
-    def mode(self, axis, numeric_only):
-        """
-        Ensure reST directives don't affect checks for leading periods.
-
-        Parameters
-        ----------
-        axis : str
-            Sentence ending in period, followed by single directive.
-
-            .. versionchanged:: 0.1.2
-
-        numeric_only : bool
-            Sentence ending in period, followed by multiple directives.
-
-            .. versionadded:: 0.1.2
-            .. deprecated:: 0.00.0
-                A multiline description,
-                which spans another line.
-        """
-        pass
-
-    def good_imports(self):
-        """
-        Ensure import other than numpy and pandas are fine.
-
-        Examples
-        --------
-        This example does not import pandas or import numpy.
-        >>> import datetime
-        >>> datetime.MAXYEAR
-        9999
-        """
-        pass
-
-    def no_returns(self):
-        """
-        Say hello and have no returns.
-        """
-        pass
-
-    def empty_returns(self):
-        """
-        Say hello and always return None.
-
-        Since this function never returns a value, this
-        docstring doesn't need a return section.
-        """
-
-        def say_hello():
-            return "Hello World!"
-
-        say_hello()
-        if True:
-            return
-        else:
-            return None
-
-    def multiple_variables_on_one_line(self, matrix, a, b, i, j):
-        """
-        Swap two values in a matrix.
-
-        Parameters
-        ----------
-        matrix : list of list
-            A double list that represents a matrix.
-        a, b : int
-            The indicies of the first value.
-        i, j : int
-            The indicies of the second value.
-        """
-        pass
-
-
-class BadGenericDocStrings:
-    """Everything here has a bad docstring
-    """
-
-    def func(self):
-
-        """Some function.
-
-        With several mistakes in the docstring.
-
-        It has a blank like after the signature `def func():`.
-
-        The text 'Some function' should go in the line after the
-        opening quotes of the docstring, not in the same line.
-
-        There is a blank line between the docstring and the first line
-        of code `foo = 1`.
-
-        The closing quotes should be in the next line, not in this one."""
-
-        foo = 1
-        bar = 2
-        return foo + bar
-
-    def astype(self, dtype):
-        """
-        Casts Series type.
-
-        Verb in third-person of the present simple, should be infinitive.
-        """
-        pass
-
-    def astype1(self, dtype):
-        """
-        Method to cast Series type.
-
-        Does not start with verb.
-        """
-        pass
-
-    def astype2(self, dtype):
-        """
-        Cast Series type
-
-        Missing dot at the end.
-        """
-        pass
-
-    def astype3(self, dtype):
-        """
-        Cast Series type from its current type to the new type defined in
-        the parameter dtype.
-
-        Summary is too verbose and doesn't fit in a single line.
-        """
-        pass
-
-    def two_linebreaks_between_sections(self, foo):
-        """
-        Test linebreaks message GL03.
-
-        Note 2 blank lines before parameters section.
-
-
-        Parameters
-        ----------
-        foo : str
-            Description of foo parameter.
-        """
-        pass
-
-    def linebreak_at_end_of_docstring(self, foo):
-        """
-        Test linebreaks message GL03.
-
-        Note extra blank line at end of docstring.
-
-        Parameters
-        ----------
-        foo : str
-            Description of foo parameter.
-
-        """
-        pass
-
-    def plot(self, kind, **kwargs):
-        """
-        Generate a plot.
-
-        Render the data in the Series as a matplotlib plot of the
-        specified kind.
-
-        Note the blank line between the parameters title and the first
-        parameter. Also, note that after the name of the parameter `kind`
-        and before the colon, a space is missing.
-
-        Also, note that the parameter descriptions do not start with a
-        capital letter, and do not finish with a dot.
-
-        Finally, the `**kwargs` parameter is missing.
-
-        Parameters
-        ----------
-
-        kind: str
-            kind of matplotlib plot
-        """
-        pass
-
-    def method(self, foo=None, bar=None):
-        """
-        A sample DataFrame method.
-
-        Do not import numpy and pandas.
-
-        Try to use meaningful data, when it makes the example easier
-        to understand.
-
-        Try to avoid positional arguments like in `df.method(1)`. They
-        can be alright if previously defined with a meaningful name,
-        like in `present_value(interest_rate)`, but avoid them otherwise.
-
-        When presenting the behavior with different parameters, do not place
-        all the calls one next to the other. Instead, add a short sentence
-        explaining what the example shows.
-
-        Examples
-        --------
-        >>> import numpy as np
-        >>> import pandas as pd
-        >>> df = pd.DataFrame(np.ones((3, 3)),
-        ...                   columns=('a', 'b', 'c'))
-        >>> df.all(1)
-        0    True
-        1    True
-        2    True
-        dtype: bool
-        >>> df.all(bool_only=True)
-        Series([], dtype: bool)
-        """
-        pass
-
-    def private_classes(self):
-        """
-        This mentions NDFrame, which is not correct.
-        """
-
-    def unknown_section(self):
-        """
-        This section has an unknown section title.
-
-        Unknown Section
-        ---------------
-        This should raise an error in the validation.
-        """
-
-    def sections_in_wrong_order(self):
-        """
-        This docstring has the sections in the wrong order.
-
-        Parameters
-        ----------
-        name : str
-            This section is in the right position.
-
-        Examples
-        --------
-        >>> print('So far Examples is good, as it goes before Parameters')
-        So far Examples is good, as it goes before Parameters
-
-        See Also
-        --------
-        function : This should generate an error, as See Also needs to go
-            before Examples.
-        """
-
-    def deprecation_in_wrong_order(self):
-        """
-        This docstring has the deprecation warning in the wrong order.
-
-        This is the extended summary. The correct order should be
-        summary, deprecation warning, extended summary.
-
-        .. deprecated:: 1.0
-            This should generate an error as it needs to go before
-            extended summary.
-        """
-
-    def method_wo_docstrings(self):
-        pass
-
-    def directives_without_two_colons(self, first, second):
-        """
-        Ensure reST directives have trailing colons.
-
-        Parameters
-        ----------
-        first : str
-            Sentence ending in period, followed by single directive w/o colons.
-
-            .. versionchanged 0.1.2
-
-        second : bool
-            Sentence ending in period, followed by multiple directives w/o
-            colons.
-
-            .. versionadded 0.1.2
-            .. deprecated 0.00.0
-
-        """
-        pass
-
-
-class BadSummaries:
-    def wrong_line(self):
-        """Exists on the wrong line"""
-        pass
-
-    def no_punctuation(self):
-        """
-        Has the right line but forgets punctuation
-        """
-        pass
-
-    def no_capitalization(self):
-        """
-        provides a lowercase summary.
-        """
-        pass
-
-    def no_infinitive(self):
-        """
-        Started with a verb that is not infinitive.
-        """
-
-    def multi_line(self):
-        """
-        Extends beyond one line
-        which is not correct.
-        """
-
-    def two_paragraph_multi_line(self):
-        """
-        Extends beyond one line
-        which is not correct.
-
-        Extends beyond one line, which in itself is correct but the
-        previous short summary should still be an issue.
- """ - - -class BadParameters: - """ - Everything here has a problem with its Parameters section. - """ - - def missing_params(self, kind, **kwargs): - """ - Lacks kwargs in Parameters. - - Parameters - ---------- - kind : str - Foo bar baz. - """ - - def bad_colon_spacing(self, kind): - """ - Has bad spacing in the type line. - - Parameters - ---------- - kind: str - Needs a space after kind. - """ - - def no_description_period(self, kind): - """ - Forgets to add a period to the description. - - Parameters - ---------- - kind : str - Doesn't end with a dot - """ - - def no_description_period_with_directive(self, kind): - """ - Forgets to add a period, and also includes a directive. - - Parameters - ---------- - kind : str - Doesn't end with a dot - - .. versionadded:: 0.00.0 - """ - - def no_description_period_with_directives(self, kind): - """ - Forgets to add a period, and also includes multiple directives. - - Parameters - ---------- - kind : str - Doesn't end with a dot - - .. versionchanged:: 0.00.0 - .. deprecated:: 0.00.0 - """ - - def parameter_capitalization(self, kind): - """ - Forgets to capitalize the description. - - Parameters - ---------- - kind : str - this is not capitalized. - """ - - def blank_lines(self, kind): - """ - Adds a blank line after the section header. - - Parameters - ---------- - - kind : str - Foo bar baz. - """ - pass - - def integer_parameter(self, kind): - """ - Uses integer instead of int. - - Parameters - ---------- - kind : integer - Foo bar baz. - """ - pass - - def string_parameter(self, kind): - """ - Uses string instead of str. - - Parameters - ---------- - kind : string - Foo bar baz. - """ - pass - - def boolean_parameter(self, kind): - """ - Uses boolean instead of bool. - - Parameters - ---------- - kind : boolean - Foo bar baz. - """ - pass - - def list_incorrect_parameter_type(self, kind): - """ - Uses list of boolean instead of list of bool. - - Parameters - ---------- - kind : list of boolean, integer, float or string - Foo bar baz. - """ - pass - - def bad_parameter_spacing(self, a, b): - """ - The parameters on the same line have an extra space between them. - - Parameters - ---------- - a, b : int - Foo bar baz. - """ - pass - - -class BadReturns: - def return_not_documented(self): - """ - Lacks section for Returns - """ - return "Hello world!" - - def yield_not_documented(self): - """ - Lacks section for Yields - """ - yield "Hello world!" - - def no_type(self): - """ - Returns documented but without type. - - Returns - ------- - Some value. - """ - return "Hello world!" - - def no_description(self): - """ - Provides type but no description. - - Returns - ------- - str - """ - return "Hello world!" - - def no_punctuation(self): - """ - Provides type and description but no period. - - Returns - ------- - str - A nice greeting - """ - return "Hello world!" - - def named_single_return(self): - """ - Provides name but returns only one value. - - Returns - ------- - s : str - A nice greeting. - """ - return "Hello world!" - - def no_capitalization(self): - """ - Forgets capitalization in return values description. - - Returns - ------- - foo : str - The first returned string. - bar : str - the second returned string. - """ - return "Hello", "World!" +class BadDocstrings: + """Everything here has a bad docstring + """ - def no_period_multi(self): + def private_classes(self): """ - Forgets period in return values description. - - Returns - ------- - foo : str - The first returned string - bar : str - The second returned string. 
+ This mentions NDFrame, which is not correct. """ - return "Hello", "World!" - -class BadSeeAlso: - def desc_no_period(self): + def prefix_pandas(self): """ - Return the first 5 elements of the Series. + Have `pandas` prefix in See Also section. See Also -------- - Series.tail : Return the last 5 elements of the Series. - Series.iloc : Return a slice of the elements in the Series, - which can also be used to return the first or last n + pandas.Series.rename : Alter Series index labels or name. + DataFrame.head : The first `n` rows of the caller object. """ pass - def desc_first_letter_lowercase(self): - """ - Return the first 5 elements of the Series. - - See Also - -------- - Series.tail : return the last 5 elements of the Series. - Series.iloc : Return a slice of the elements in the Series, - which can also be used to return the first or last n. + def redundant_import(self, foo=None, bar=None): """ - pass + A sample DataFrame method. - def prefix_pandas(self): - """ - Have `pandas` prefix in See Also section. + Should not import numpy and pandas. - See Also + Examples -------- - pandas.Series.rename : Alter Series index labels or name. - DataFrame.head : The first `n` rows of the caller object. + >>> import numpy as np + >>> import pandas as pd + >>> df = pd.DataFrame(np.ones((3, 3)), + ... columns=('a', 'b', 'c')) + >>> df.all(1) + 0 True + 1 True + 2 True + dtype: bool + >>> df.all(bool_only=True) + Series([], dtype: bool) """ pass - -class BadExamples: def unused_import(self): """ Examples @@ -877,59 +110,9 @@ def _import_path(self, klass=None, func=None): return base_path - def test_good_class(self, capsys): - errors = validate_one(self._import_path(klass="GoodDocStrings"))["errors"] - assert isinstance(errors, list) - assert not errors - - @pytest.mark.parametrize( - "func", - [ - "plot", - "swap", - "sample", - "decorated_sample", - "random_letters", - "sample_values", - "head", - "head1", - "contains", - "mode", - "good_imports", - "no_returns", - "empty_returns", - "multiple_variables_on_one_line", - ], - ) - def test_good_functions(self, capsys, func): - errors = validate_one(self._import_path(klass="GoodDocStrings", func=func))[ - "errors" - ] - assert isinstance(errors, list) - assert not errors - def test_bad_class(self, capsys): - errors = validate_one(self._import_path(klass="BadGenericDocStrings"))["errors"] - assert isinstance(errors, list) - assert errors - - @pytest.mark.parametrize( - "func", - [ - "func", - "astype", - "astype1", - "astype2", - "astype3", - "plot", - "method", - "private_classes", - "directives_without_two_colons", - ], - ) - def test_bad_generic_functions(self, capsys, func): - errors = validate_one( - self._import_path(klass="BadGenericDocStrings", func=func) # noqa:F821 + errors = validate_docstrings.pandas_validate( + self._import_path(klass="BadDocstrings") )["errors"] assert isinstance(errors, list) assert errors @@ -937,9 +120,8 @@ def test_bad_generic_functions(self, capsys, func): @pytest.mark.parametrize( "klass,func,msgs", [ - # See Also tests ( - "BadGenericDocStrings", + "BadDocstrings", "private_classes", ( "Private classes (NDFrame) should not be mentioned in public " @@ -947,200 +129,31 @@ def test_bad_generic_functions(self, capsys, func): ), ), ( - "BadGenericDocStrings", - "unknown_section", - ('Found unknown section "Unknown Section".',), - ), - ( - "BadGenericDocStrings", - "sections_in_wrong_order", - ( - "Sections are in the wrong order. 
-                    "See Also, Examples",
-                ),
-            ),
-            (
-                "BadGenericDocStrings",
-                "deprecation_in_wrong_order",
-                ("Deprecation warning should precede extended summary",),
-            ),
-            (
-                "BadGenericDocStrings",
-                "directives_without_two_colons",
-                (
-                    "reST directives ['versionchanged', 'versionadded', "
-                    "'deprecated'] must be followed by two colons",
-                ),
-            ),
-            (
-                "BadSeeAlso",
-                "desc_no_period",
-                ('Missing period at end of description for See Also "Series.iloc"',),
-            ),
-            (
-                "BadSeeAlso",
-                "desc_first_letter_lowercase",
-                ('should be capitalized for See Also "Series.tail"',),
-            ),
-            # Summary tests
-            (
-                "BadSummaries",
-                "wrong_line",
-                ("should start in the line immediately after the opening quotes",),
-            ),
-            ("BadSummaries", "no_punctuation", ("Summary does not end with a period",)),
-            (
-                "BadSummaries",
-                "no_capitalization",
-                ("Summary does not start with a capital letter",),
-            ),
-            (
-                "BadSummaries",
-                "no_capitalization",
-                ("Summary must start with infinitive verb",),
-            ),
-            ("BadSummaries", "multi_line", ("Summary should fit in a single line",)),
-            (
-                "BadSummaries",
-                "two_paragraph_multi_line",
-                ("Summary should fit in a single line",),
-            ),
-            # Parameters tests
-            (
-                "BadParameters",
-                "missing_params",
-                ("Parameters {**kwargs} not documented",),
-            ),
-            (
-                "BadParameters",
-                "bad_colon_spacing",
-                (
-                    'Parameter "kind" requires a space before the colon '
-                    "separating the parameter name and type",
-                ),
-            ),
-            (
-                "BadParameters",
-                "no_description_period",
-                ('Parameter "kind" description should finish with "."',),
-            ),
-            (
-                "BadParameters",
-                "no_description_period_with_directive",
-                ('Parameter "kind" description should finish with "."',),
-            ),
-            (
-                "BadParameters",
-                "parameter_capitalization",
-                ('Parameter "kind" description should start with a capital letter',),
-            ),
-            (
-                "BadParameters",
-                "integer_parameter",
-                ('Parameter "kind" type should use "int" instead of "integer"',),
-            ),
-            (
-                "BadParameters",
-                "string_parameter",
-                ('Parameter "kind" type should use "str" instead of "string"',),
-            ),
-            (
-                "BadParameters",
-                "boolean_parameter",
-                ('Parameter "kind" type should use "bool" instead of "boolean"',),
-            ),
-            (
-                "BadParameters",
-                "list_incorrect_parameter_type",
-                ('Parameter "kind" type should use "bool" instead of "boolean"',),
-            ),
-            (
-                "BadParameters",
-                "list_incorrect_parameter_type",
-                ('Parameter "kind" type should use "int" instead of "integer"',),
-            ),
-            (
-                "BadParameters",
-                "list_incorrect_parameter_type",
-                ('Parameter "kind" type should use "str" instead of "string"',),
-            ),
-            (
-                "BadParameters",
-                "bad_parameter_spacing",
-                ("Parameters {b} not documented", "Unknown parameters { b}"),
-            ),
-            pytest.param(
-                "BadParameters",
-                "blank_lines",
-                ("No error yet?",),
-                marks=pytest.mark.xfail,
-            ),
-            # Returns tests
-            ("BadReturns", "return_not_documented", ("No Returns section found",)),
-            ("BadReturns", "yield_not_documented", ("No Yields section found",)),
-            pytest.param("BadReturns", "no_type", ("foo",), marks=pytest.mark.xfail),
-            ("BadReturns", "no_description", ("Return value has no description",)),
-            (
-                "BadReturns",
-                "no_punctuation",
-                ('Return value description should finish with "."',),
-            ),
-            (
-                "BadReturns",
-                "named_single_return",
-                (
-                    "The first line of the Returns section should contain only the "
-                    "type, unless multiple values are being returned",
-                ),
-            ),
-            (
-                "BadReturns",
"no_capitalization", - ("Return value description should start with a capital letter",), - ), - ( - "BadReturns", - "no_period_multi", - ('Return value description should finish with "."',), - ), # Examples tests ( - "BadGenericDocStrings", - "method", + "BadDocstrings", + "redundant_import", ("Do not import numpy, as it is imported automatically",), ), ( - "BadGenericDocStrings", - "method", + "BadDocstrings", + "redundant_import", ("Do not import pandas, as it is imported automatically",), ), ( - "BadGenericDocStrings", - "method_wo_docstrings", - ("The object does not have a docstring",), - ), - # See Also tests - ( - "BadSeeAlso", - "prefix_pandas", - ( - "pandas.Series.rename in `See Also` section " - "does not need `pandas` prefix", - ), - ), - # Examples tests - ( - "BadExamples", + "BadDocstrings", "unused_import", ("flake8 error: F401 'pandas as pdf' imported but unused",), ), ( - "BadExamples", - "indentation_is_not_a_multiple_of_four", - ("flake8 error: E111 indentation is not a multiple of four",), - ), - ( - "BadExamples", + "BadDocstrings", "missing_whitespace_around_arithmetic_operator", ( "flake8 error: " @@ -1148,39 +161,28 @@ def test_bad_generic_functions(self, capsys, func): ), ), ( - "BadExamples", - "missing_whitespace_after_comma", - ("flake8 error: E231 missing whitespace after ',' (3 times)",), - ), - ( - "BadGenericDocStrings", - "two_linebreaks_between_sections", - ( - "Double line break found; please use only one blank line to " - "separate sections or paragraphs, and do not leave blank lines " - "at the end of docstrings", - ), + "BadDocstrings", + "indentation_is_not_a_multiple_of_four", + ("flake8 error: E111 indentation is not a multiple of four",), ), ( - "BadGenericDocStrings", - "linebreak_at_end_of_docstring", - ( - "Double line break found; please use only one blank line to " - "separate sections or paragraphs, and do not leave blank lines " - "at the end of docstrings", - ), + "BadDocstrings", + "missing_whitespace_after_comma", + ("flake8 error: E231 missing whitespace after ',' (3 times)",), ), ], ) def test_bad_docstrings(self, capsys, klass, func, msgs): - result = validate_one(self._import_path(klass=klass, func=func)) + result = validate_docstrings.pandas_validate( + self._import_path(klass=klass, func=func) + ) for msg in msgs: assert msg in " ".join(err[1] for err in result["errors"]) def test_validate_all_ignore_deprecated(self, monkeypatch): monkeypatch.setattr( validate_docstrings, - "validate_one", + "pandas_validate", lambda func_name: { "docstring": "docstring1", "errors": [ @@ -1285,50 +287,22 @@ def test_item_subsection(self, idx, subsection): assert result[idx][3] == subsection -class TestDocstringClass: - @pytest.mark.parametrize( - "name, expected_obj", - [ - ("pandas.isnull", pd.isnull), - ("pandas.DataFrame", pd.DataFrame), - ("pandas.Series.sum", pd.Series.sum), - ], - ) - def test_resolves_class_name(self, name, expected_obj): - d = validate_docstrings.Docstring(name) - assert d.obj is expected_obj - - @pytest.mark.parametrize("invalid_name", ["panda", "panda.DataFrame"]) - def test_raises_for_invalid_module_name(self, invalid_name): - msg = f'No module can be imported from "{invalid_name}"' - with pytest.raises(ImportError, match=msg): - validate_docstrings.Docstring(invalid_name) - - @pytest.mark.parametrize( - "invalid_name", ["pandas.BadClassName", "pandas.Series.bad_method_name"] - ) - def test_raises_for_invalid_attribute_name(self, invalid_name): - name_components = invalid_name.split(".") - obj_name, invalid_attr_name = 
-        msg = f"'{obj_name}' has no attribute '{invalid_attr_name}'"
-        with pytest.raises(AttributeError, match=msg):
-            validate_docstrings.Docstring(invalid_name)
-
+class TestPandasDocstringClass:
     @pytest.mark.parametrize(
         "name", ["pandas.Series.str.isdecimal", "pandas.Series.str.islower"]
     )
     def test_encode_content_write_to_file(self, name):
         # GH25466
-        docstr = validate_docstrings.Docstring(name).validate_pep8()
+        docstr = validate_docstrings.PandasDocstring(name).validate_pep8()
         # the list of pep8 errors should be empty
         assert not list(docstr)
 
 
 class TestMainFunction:
-    def test_exit_status_for_validate_one(self, monkeypatch):
+    def test_exit_status_for_main(self, monkeypatch):
         monkeypatch.setattr(
             validate_docstrings,
-            "validate_one",
+            "pandas_validate",
             lambda func_name: {
                 "docstring": "docstring1",
                 "errors": [
                     ("ER01", "err desc"),
@@ -1336,8 +310,7 @@ def test_exit_status_for_validate_one(self, monkeypatch):
                     ("ER02", "err desc"),
                     ("ER03", "err desc"),
                 ],
-                "warnings": [],
-                "examples_errors": "",
+                "examples_errs": "",
             },
         )
         exit_status = validate_docstrings.main(
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index bcf3fd5d276f5..079e9a16cfd13 100755
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -14,19 +14,14 @@
     $ ./validate_docstrings.py pandas.DataFrame.head
 """
 import argparse
-import ast
 import doctest
-import functools
 import glob
 import importlib
-import inspect
 import json
 import os
-import pydoc
-import re
 import sys
 import tempfile
-import textwrap
+from typing import List, Optional
 
 import flake8.main.application
 
@@ -52,87 +47,15 @@
 import pandas  # noqa: E402 isort:skip
 
 sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext"))
-from numpydoc.docscrape import NumpyDocString  # noqa: E402 isort:skip
-
-from pandas.io.formats.printing import pprint_thing  # noqa: E402 isort:skip
+from numpydoc.validate import validate, Docstring  # noqa: E402 isort:skip
 
 PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"]
-DIRECTIVES = ["versionadded", "versionchanged", "deprecated"]
-DIRECTIVE_PATTERN = re.compile(rf"^\s*\.\. ({'|'.join(DIRECTIVES)})(?!::)", re.I | re.M)
-ALLOWED_SECTIONS = [
-    "Parameters",
-    "Attributes",
-    "Methods",
-    "Returns",
-    "Yields",
-    "Other Parameters",
-    "Raises",
-    "Warns",
-    "See Also",
-    "Notes",
-    "References",
-    "Examples",
-]
 ERROR_MSGS = {
-    "GL01": "Docstring text (summary) should start in the line immediately "
-    "after the opening quotes (not in the same line, or leaving a "
-    "blank line in between)",
-    "GL02": "Closing quotes should be placed in the line after the last text "
-    "in the docstring (do not close the quotes in the same line as "
-    "the text, or leave a blank line between the last text and the "
-    "quotes)",
-    "GL03": "Double line break found; please use only one blank line to "
-    "separate sections or paragraphs, and do not leave blank lines "
-    "at the end of docstrings",
     "GL04": "Private classes ({mentioned_private_classes}) should not be "
     "mentioned in public docstrings",
-    "GL05": 'Tabs found at the start of line "{line_with_tabs}", please use '
-    "whitespace only",
-    "GL06": 'Found unknown section "{section}". Allowed sections are: '
-    "{allowed_sections}",
-    "GL07": "Sections are in the wrong order. Correct order is: {correct_sections}",
-    "GL08": "The object does not have a docstring",
-    "GL09": "Deprecation warning should precede extended summary",
-    "GL10": "reST directives {directives} must be followed by two colons",
-    "SS01": "No summary found (a short summary in a single line should be "
-    "present at the beginning of the docstring)",
-    "SS02": "Summary does not start with a capital letter",
-    "SS03": "Summary does not end with a period",
-    "SS04": "Summary contains heading whitespaces",
-    "SS05": "Summary must start with infinitive verb, not third person "
-    '(e.g. use "Generate" instead of "Generates")',
-    "SS06": "Summary should fit in a single line",
-    "ES01": "No extended summary found",
-    "PR01": "Parameters {missing_params} not documented",
-    "PR02": "Unknown parameters {unknown_params}",
-    "PR03": "Wrong parameters order. Actual: {actual_params}. "
-    "Documented: {documented_params}",
-    "PR04": 'Parameter "{param_name}" has no type',
-    "PR05": 'Parameter "{param_name}" type should not finish with "."',
-    "PR06": 'Parameter "{param_name}" type should use "{right_type}" instead '
-    'of "{wrong_type}"',
-    "PR07": 'Parameter "{param_name}" has no description',
-    "PR08": 'Parameter "{param_name}" description should start with a '
-    "capital letter",
-    "PR09": 'Parameter "{param_name}" description should finish with "."',
-    "PR10": 'Parameter "{param_name}" requires a space before the colon '
-    "separating the parameter name and type",
-    "RT01": "No Returns section found",
-    "RT02": "The first line of the Returns section should contain only the "
-    "type, unless multiple values are being returned",
-    "RT03": "Return value has no description",
-    "RT04": "Return value description should start with a capital letter",
-    "RT05": 'Return value description should finish with "."',
-    "YD01": "No Yields section found",
-    "SA01": "See Also section not found",
-    "SA02": "Missing period at end of description for See Also "
-    '"{reference_name}" reference',
-    "SA03": "Description should be capitalized for See Also "
-    '"{reference_name}" reference',
-    "SA04": 'Missing description for See Also "{reference_name}" reference',
     "SA05": "{reference_name} in `See Also` section does not need `pandas` "
     "prefix, use {right_reference} instead.",
-    "EX01": "No examples section found",
     "EX02": "Examples do not pass tests:\n{doctest_log}",
     "EX03": "flake8 error: {error_code} {error_message}{times_happening}",
     "EX04": "Do not import {imported_library}, as it is imported "
     "automatically by the validation script (and may not work)",
 }
 
 
-def error(code, **kwargs):
+def pandas_error(code, **kwargs):
     """
-    Return a tuple with the error code and the message with variables replaced.
-
-    This is syntactic sugar so instead of:
-
-    `('EX02', ERROR_MSGS['EX02'].format(doctest_log=log))`
-
-    We can simply use:
-
-    `error('EX02', doctest_log=log)`
-
-    Parameters
-    ----------
-    code : str
-        Error code.
-    **kwargs
-        Values for the variables in the error messages
-
-    Returns
-    -------
-    code : str
-        Error code.
-    message : str
-        Error message with variables replaced.
+    Copy of the numpydoc error function, since ERROR_MSGS can't be updated
+    with our custom errors yet.
""" return (code, ERROR_MSGS[code].format(**kwargs)) @@ -239,347 +143,7 @@ def get_api_items(api_doc_fd): previous_line = line -class Docstring: - def __init__(self, name): - self.name = name - obj = self._load_obj(name) - self.obj = obj - self.code_obj = self._to_original_callable(obj) - self.raw_doc = obj.__doc__ or "" - self.clean_doc = pydoc.getdoc(obj) - self.doc = NumpyDocString(self.clean_doc) - - def __len__(self) -> int: - return len(self.raw_doc) - - @staticmethod - def _load_obj(name): - """ - Import Python object from its name as string. - - Parameters - ---------- - name : str - Object name to import (e.g. pandas.Series.str.upper) - - Returns - ------- - object - Python object that can be a class, method, function... - - Examples - -------- - >>> Docstring._load_obj('pandas.Series') - - """ - for maxsplit in range(1, name.count(".") + 1): - # TODO when py3 only replace by: module, *func_parts = ... - func_name_split = name.rsplit(".", maxsplit) - module = func_name_split[0] - func_parts = func_name_split[1:] - try: - obj = importlib.import_module(module) - except ImportError: - pass - else: - continue - - if "obj" not in locals(): - raise ImportError(f'No module can be imported from "{name}"') - - for part in func_parts: - obj = getattr(obj, part) - return obj - - @staticmethod - def _to_original_callable(obj): - """ - Find the Python object that contains the source code of the object. - - This is useful to find the place in the source code (file and line - number) where a docstring is defined. It does not currently work for - all cases, but it should help find some (properties...). - """ - while True: - if inspect.isfunction(obj) or inspect.isclass(obj): - f = inspect.getfile(obj) - if f.startswith("<") and f.endswith(">"): - return None - return obj - if inspect.ismethod(obj): - obj = obj.__func__ - elif isinstance(obj, functools.partial): - obj = obj.func - elif isinstance(obj, property): - obj = obj.fget - else: - return None - - @property - def type(self): - return type(self.obj).__name__ - - @property - def is_function_or_method(self): - # TODO(py27): remove ismethod - return inspect.isfunction(self.obj) or inspect.ismethod(self.obj) - - @property - def source_file_name(self): - """ - File name where the object is implemented (e.g. pandas/core/frame.py). - """ - try: - fname = inspect.getsourcefile(self.code_obj) - except TypeError: - # In some cases the object is something complex like a cython - # object that can't be easily introspected. An it's better to - # return the source code file of the object as None, than crash - pass - else: - if fname: - fname = os.path.relpath(fname, BASE_PATH) - return fname - - @property - def source_file_def_line(self): - """ - Number of line where the object is defined in its file. - """ - try: - return inspect.getsourcelines(self.code_obj)[-1] - except (OSError, TypeError): - # In some cases the object is something complex like a cython - # object that can't be easily introspected. 
-            # return the line number as None, than crash
-            pass
-
-    @property
-    def github_url(self):
-        url = "https://github.com/pandas-dev/pandas/blob/master/"
-        url += f"{self.source_file_name}#L{self.source_file_def_line}"
-        return url
-
-    @property
-    def start_blank_lines(self):
-        i = None
-        if self.raw_doc:
-            for i, row in enumerate(self.raw_doc.split("\n")):
-                if row.strip():
-                    break
-        return i
-
-    @property
-    def end_blank_lines(self):
-        i = None
-        if self.raw_doc:
-            for i, row in enumerate(reversed(self.raw_doc.split("\n"))):
-                if row.strip():
-                    break
-        return i
-
-    @property
-    def double_blank_lines(self):
-        prev = True
-        for row in self.raw_doc.split("\n"):
-            if not prev and not row.strip():
-                return True
-            prev = row.strip()
-        return False
-
-    @property
-    def section_titles(self):
-        sections = []
-        self.doc._doc.reset()
-        while not self.doc._doc.eof():
-            content = self.doc._read_to_next_section()
-            if (
-                len(content) > 1
-                and len(content[0]) == len(content[1])
-                and set(content[1]) == {"-"}
-            ):
-                sections.append(content[0])
-        return sections
-
-    @property
-    def summary(self):
-        return " ".join(self.doc["Summary"])
-
-    @property
-    def num_summary_lines(self):
-        return len(self.doc["Summary"])
-
-    @property
-    def extended_summary(self):
-        if not self.doc["Extended Summary"] and len(self.doc["Summary"]) > 1:
-            return " ".join(self.doc["Summary"])
-        return " ".join(self.doc["Extended Summary"])
-
-    @property
-    def needs_summary(self):
-        return not (bool(self.summary) and bool(self.extended_summary))
-
-    @property
-    def doc_parameters(self):
-        parameters = {}
-        for names, type_, desc in self.doc["Parameters"]:
-            for name in names.split(", "):
-                parameters[name] = (type_, "".join(desc))
-        return parameters
-
-    @property
-    def signature_parameters(self):
-        def add_stars(param_name: str, info: inspect.Parameter):
-            """
-            Add stars to *args and **kwargs parameters
-            """
-            if info.kind == inspect.Parameter.VAR_POSITIONAL:
-                return f"*{param_name}"
-            elif info.kind == inspect.Parameter.VAR_KEYWORD:
-                return f"**{param_name}"
-            else:
-                return param_name
-
-        if inspect.isclass(self.obj):
-            if hasattr(self.obj, "_accessors") and (
-                self.name.split(".")[-1] in self.obj._accessors
-            ):
-                # accessor classes have a signature but don't want to show this
-                return tuple()
-        try:
-            sig = inspect.signature(self.obj)
-        except (TypeError, ValueError):
-            # Some objects, mainly in C extensions do not support introspection
-            # of the signature
-            return tuple()
-
-        params = tuple(
-            add_stars(parameter, sig.parameters[parameter])
-            for parameter in sig.parameters
-        )
-        if params and params[0] in ("self", "cls"):
-            return params[1:]
-        return params
-
-    @property
-    def parameter_mismatches(self):
-        errs = []
-        signature_params = self.signature_parameters
-        doc_params = tuple(self.doc_parameters)
-        missing = set(signature_params) - set(doc_params)
-        if missing:
-            errs.append(error("PR01", missing_params=pprint_thing(missing)))
-        extra = set(doc_params) - set(signature_params)
-        if extra:
-            errs.append(error("PR02", unknown_params=pprint_thing(extra)))
-        if (
-            not missing
-            and not extra
-            and signature_params != doc_params
-            and not (not signature_params and not doc_params)
-        ):
-            errs.append(
-                error(
-                    "PR03", actual_params=signature_params, documented_params=doc_params
-                )
-            )
-
-        return errs
-
-    @property
-    def correct_parameters(self):
-        return not bool(self.parameter_mismatches)
-
-    @property
-    def directives_without_two_colons(self):
-        return DIRECTIVE_PATTERN.findall(self.raw_doc)
-
-    def parameter_type(self, param):
-        return self.doc_parameters[param][0]
-
-    def parameter_desc(self, param):
-        desc = self.doc_parameters[param][1]
-        # Find and strip out any sphinx directives
-        for directive in DIRECTIVES:
-            full_directive = f".. {directive}"
-            if full_directive in desc:
-                # Only retain any description before the directive
-                desc = desc[: desc.index(full_directive)]
-        return desc
-
-    @property
-    def see_also(self):
-        result = {}
-        for funcs, desc in self.doc["See Also"]:
-            for func, _ in funcs:
-                result[func] = "".join(desc)
-
-        return result
-
-    @property
-    def examples(self):
-        return self.doc["Examples"]
-
-    @property
-    def returns(self):
-        return self.doc["Returns"]
-
-    @property
-    def yields(self):
-        return self.doc["Yields"]
-
-    @property
-    def method_source(self):
-        try:
-            source = inspect.getsource(self.obj)
-        except TypeError:
-            return ""
-        return textwrap.dedent(source)
-
-    @property
-    def method_returns_something(self):
-        """
-        Check if the docstrings method can return something.
-
-        Bare returns, returns valued None and returns from nested functions are
-        disconsidered.
-
-        Returns
-        -------
-        bool
-            Whether the docstrings method can return something.
-        """
-
-        def get_returns_not_on_nested_functions(node):
-            returns = [node] if isinstance(node, ast.Return) else []
-            for child in ast.iter_child_nodes(node):
-                # Ignore nested functions and its subtrees.
-                if not isinstance(child, ast.FunctionDef):
-                    child_returns = get_returns_not_on_nested_functions(child)
-                    returns.extend(child_returns)
-            return returns
-
-        tree = ast.parse(self.method_source).body
-        if tree:
-            returns = get_returns_not_on_nested_functions(tree[0])
-            return_values = [r.value for r in returns]
-            # Replace NameConstant nodes valued None for None.
-            for i, v in enumerate(return_values):
-                if isinstance(v, ast.NameConstant) and v.value is None:
-                    return_values[i] = None
-            return any(return_values)
-        else:
-            return False
-
-    @property
-    def first_line_ends_in_dot(self):
-        if self.doc:
-            return self.doc.split("\n")[0][-1] == "."
-
-    @property
-    def deprecated(self):
-        return ".. deprecated:: " in (self.summary + self.extended_summary)
-
+class PandasDocstring(Docstring):
     @property
     def mentioned_private_classes(self):
         return [klass for klass in PRIVATE_CLASSES if klass in self.raw_doc]
@@ -632,237 +196,66 @@ def validate_pep8(self):
         yield from application.guide.stats.statistics_for("")
 
 
-def get_validation_data(doc):
+def pandas_validate(func_name: str):
     """
-    Validate the docstring.
+    Call the numpydoc validation, and add the errors specific to pandas.
 
     Parameters
     ----------
-    doc : Docstring
-        A Docstring object with the given function name.
+    func_name : str
+        Name of the object of the docstring to validate.
 
     Returns
     -------
-    tuple
-        errors : list of tuple
-            Errors occurred during validation.
-        warnings : list of tuple
-            Warnings occurred during validation.
-        examples_errs : str
-            Examples usage displayed along the error, otherwise empty string.
-
-    Notes
-    -----
-    The errors codes are defined as:
-    - First two characters: Section where the error happens:
-       * GL: Global (no section, like section ordering errors)
-       * SS: Short summary
-       * ES: Extended summary
-       * PR: Parameters
-       * RT: Returns
-       * YD: Yields
-       * RS: Raises
-       * WN: Warns
-       * SA: See Also
-       * NT: Notes
-       * RF: References
-       * EX: Examples
-    - Last two characters: Numeric error code inside the section
-
-    For example, EX02 is the second codified error in the Examples section
-    (which in this case is assigned to examples that do not pass the tests).
-
-    The error codes, their corresponding error messages, and the details on how
-    they are validated, are not documented more than in the source code of this
-    function.
+    dict
+        Information about the docstring and the errors found.
     """
+    doc = PandasDocstring(func_name)
+    result = validate(func_name)
 
-    errs = []
-    wrns = []
-    if not doc.raw_doc:
-        errs.append(error("GL08"))
-        return errs, wrns, ""
-
-    if doc.start_blank_lines != 1:
-        errs.append(error("GL01"))
-    if doc.end_blank_lines != 1:
-        errs.append(error("GL02"))
-    if doc.double_blank_lines:
-        errs.append(error("GL03"))
     mentioned_errs = doc.mentioned_private_classes
     if mentioned_errs:
-        errs.append(error("GL04", mentioned_private_classes=", ".join(mentioned_errs)))
-    for line in doc.raw_doc.splitlines():
-        if re.match("^ *\t", line):
-            errs.append(error("GL05", line_with_tabs=line.lstrip()))
-
-    unexpected_sections = [
-        section for section in doc.section_titles if section not in ALLOWED_SECTIONS
-    ]
-    for section in unexpected_sections:
-        errs.append(
-            error("GL06", section=section, allowed_sections=", ".join(ALLOWED_SECTIONS))
+        result["errors"].append(
+            pandas_error("GL04", mentioned_private_classes=", ".join(mentioned_errs))
         )
 
-    correct_order = [
-        section for section in ALLOWED_SECTIONS if section in doc.section_titles
-    ]
-    if correct_order != doc.section_titles:
-        errs.append(error("GL07", correct_sections=", ".join(correct_order)))
-
-    if doc.deprecated and not doc.extended_summary.startswith(".. deprecated:: "):
-        errs.append(error("GL09"))
-
-    directives_without_two_colons = doc.directives_without_two_colons
-    if directives_without_two_colons:
-        errs.append(error("GL10", directives=directives_without_two_colons))
-
-    if not doc.summary:
-        errs.append(error("SS01"))
-    else:
-        if not doc.summary[0].isupper():
-            errs.append(error("SS02"))
-        if doc.summary[-1] != ".":
-            errs.append(error("SS03"))
-        if doc.summary != doc.summary.lstrip():
-            errs.append(error("SS04"))
-        elif doc.is_function_or_method and doc.summary.split(" ")[0][-1] == "s":
-            errs.append(error("SS05"))
-        if doc.num_summary_lines > 1:
-            errs.append(error("SS06"))
-
-    if not doc.extended_summary:
-        wrns.append(("ES01", "No extended summary found"))
-
-    # PR01: Parameters not documented
-    # PR02: Unknown parameters
-    # PR03: Wrong parameters order
-    errs += doc.parameter_mismatches
-
-    for param in doc.doc_parameters:
-        if not param.startswith("*"):  # Check can ignore var / kwargs
-            if not doc.parameter_type(param):
-                if ":" in param:
-                    errs.append(error("PR10", param_name=param.split(":")[0]))
-                else:
-                    errs.append(error("PR04", param_name=param))
-            else:
-                if doc.parameter_type(param)[-1] == ".":
-                    errs.append(error("PR05", param_name=param))
-                common_type_errors = [
-                    ("integer", "int"),
-                    ("boolean", "bool"),
-                    ("string", "str"),
-                ]
-                for wrong_type, right_type in common_type_errors:
-                    if wrong_type in doc.parameter_type(param):
-                        errs.append(
-                            error(
-                                "PR06",
-                                param_name=param,
-                                right_type=right_type,
-                                wrong_type=wrong_type,
-                            )
-                        )
-            if not doc.parameter_desc(param):
-                errs.append(error("PR07", param_name=param))
-            else:
-                if not doc.parameter_desc(param)[0].isupper():
-                    errs.append(error("PR08", param_name=param))
-                if doc.parameter_desc(param)[-1] != ".":
-                    errs.append(error("PR09", param_name=param))
-
-    if doc.is_function_or_method:
-        if not doc.returns:
-            if doc.method_returns_something:
-                errs.append(error("RT01"))
-        else:
-            if len(doc.returns) == 1 and doc.returns[0].name:
-                errs.append(error("RT02"))
-            for name_or_type, type_, desc in doc.returns:
-                if not desc:
-                    errs.append(error("RT03"))
-                else:
-                    desc = " ".join(desc)
-                    if not desc[0].isupper():
-                        errs.append(error("RT04"))
-                    if not desc.endswith("."):
-                        errs.append(error("RT05"))
-
-        if not doc.yields and "yield" in doc.method_source:
-            errs.append(error("YD01"))
-
-    if not doc.see_also:
-        wrns.append(error("SA01"))
-    else:
+    if doc.see_also:
         for rel_name, rel_desc in doc.see_also.items():
-            if rel_desc:
-                if not rel_desc.endswith("."):
-                    errs.append(error("SA02", reference_name=rel_name))
-                if not rel_desc[0].isupper():
-                    errs.append(error("SA03", reference_name=rel_name))
-            else:
-                errs.append(error("SA04", reference_name=rel_name))
             if rel_name.startswith("pandas."):
-                errs.append(
-                    error(
+                result["errors"].append(
+                    pandas_error(
                         "SA05",
                         reference_name=rel_name,
                         right_reference=rel_name[len("pandas.") :],
                     )
                 )
 
-    examples_errs = ""
-    if not doc.examples:
-        wrns.append(error("EX01"))
-    else:
-        examples_errs = doc.examples_errors
-        if examples_errs:
-            errs.append(error("EX02", doctest_log=examples_errs))
+    result["examples_errs"] = ""
+    if doc.examples:
+        result["examples_errs"] = doc.examples_errors
+        if result["examples_errs"]:
+            result["errors"].append(
+                pandas_error("EX02", doctest_log=result["examples_errs"])
+            )
 
     for err in doc.validate_pep8():
-        errs.append(
-            error(
+        result["errors"].append(
+            pandas_error(
                 "EX03",
                 error_code=err.error_code,
                 error_message=err.message,
-                times_happening=f" ({err.count} times)" if err.count > 1 else "",
+                times_happening=" ({} times)".format(err.count)
+                if err.count > 1
+                else "",
             )
         )
 
     examples_source_code = "".join(doc.examples_source_code)
     for wrong_import in ("numpy", "pandas"):
-        if f"import {wrong_import}" in examples_source_code:
-            errs.append(error("EX04", imported_library=wrong_import))
-    return errs, wrns, examples_errs
-
-
-def validate_one(func_name):
-    """
-    Validate the docstring for the given func_name
-
-    Parameters
-    ----------
-    func_name : function
-        Function whose docstring will be evaluated (e.g. pandas.read_csv).
-
-    Returns
-    -------
-    dict
-        A dictionary containing all the information obtained from validating
-        the docstring.
-    """
-    doc = Docstring(func_name)
-    errs, wrns, examples_errs = get_validation_data(doc)
-    return {
-        "type": doc.type,
-        "docstring": doc.clean_doc,
-        "deprecated": doc.deprecated,
-        "file": doc.source_file_name,
-        "file_line": doc.source_file_def_line,
-        "github_link": doc.github_url,
-        "errors": errs,
-        "warnings": wrns,
-        "examples_errors": examples_errs,
-    }
+        if "import {}".format(wrong_import) in examples_source_code:
+            result["errors"].append(
+                pandas_error("EX04", imported_library=wrong_import)
+            )
+
+    return result
 
 
 def validate_all(prefix, ignore_deprecated=False):
@@ -887,16 +280,16 @@ def validate_all(prefix, ignore_deprecated=False):
     result = {}
     seen = {}
 
-    # functions from the API docs
     api_doc_fnames = os.path.join(BASE_PATH, "doc", "source", "reference", "*.rst")
     api_items = []
     for api_doc_fname in glob.glob(api_doc_fnames):
         with open(api_doc_fname) as f:
             api_items += list(get_api_items(f))
+
     for func_name, func_obj, section, subsection in api_items:
         if prefix and not func_name.startswith(prefix):
            continue
-        doc_info = validate_one(func_name)
+        doc_info = pandas_validate(func_name)
         if ignore_deprecated and doc_info["deprecated"]:
             continue
         result[func_name] = doc_info
@@ -914,100 +307,86 @@ def validate_all(prefix, ignore_deprecated=False):
 
             seen[shared_code_key] = func_name
 
-    # functions from introspecting Series and DataFrame
-    api_item_names = set(list(zip(*api_items))[0])
-    for class_ in (pandas.Series, pandas.DataFrame):
-        for member in inspect.getmembers(class_):
-            func_name = f"pandas.{class_.__name__}.{member[0]}"
-            if not member[0].startswith("_") and func_name not in api_item_names:
-                if prefix and not func_name.startswith(prefix):
-                    continue
-                doc_info = validate_one(func_name)
-                if ignore_deprecated and doc_info["deprecated"]:
-                    continue
-                result[func_name] = doc_info
-                result[func_name]["in_api"] = False
-
     return result
 
 
-def main(func_name, prefix, errors, output_format, ignore_deprecated):
+def print_validate_all_results(
+    prefix: str,
+    errors: Optional[List[str]],
+    output_format: str,
+    ignore_deprecated: bool,
+):
+    if output_format not in ("default", "json", "actions"):
+        raise ValueError(f'Unknown output_format "{output_format}"')
+
+    result = validate_all(prefix, ignore_deprecated)
+
+    if output_format == "json":
+        sys.stdout.write(json.dumps(result))
+        return 0
+
+    prefix = "##[error]" if output_format == "actions" else ""
+    exit_status = 0
+    for name, res in result.items():
+        for err_code, err_desc in res["errors"]:
+            if errors and err_code not in errors:
+                continue
+            sys.stdout.write(
+                f'{prefix}{res["file"]}:{res["file_line"]}:'
+                f"{err_code}:{name}:{err_desc}\n"
+            )
+            exit_status += 1
+
+    return exit_status
+
+
+def print_validate_one_results(func_name: str):
     def header(title, width=80, char="#"):
         full_line = char * width
         side_len = (width - len(title) - 2) // 2
         adj = "" if len(title) % 2 == 0 else " "
-        title_line = f"{char * side_len} {title}{adj} {char * side_len}"
+        title_line = "{side} {title}{adj} {side}".format(
+            side=char * side_len, title=title, adj=adj
+        )
         return f"\n{full_line}\n{title_line}\n{full_line}\n\n"
 
-    exit_status = 0
-    if func_name is None:
-        result = validate_all(prefix, ignore_deprecated)
-
-        if output_format == "json":
-            output = json.dumps(result)
-        else:
-            if output_format == "default":
-                output_format = "{text}\n"
-            elif output_format == "azure":
-                output_format = (
-                    "##vso[task.logissue type=error;"
-                    "sourcepath={path};"
-                    "linenumber={row};"
-                    "code={code};"
-                    "]{text}\n"
-                )
-            else:
-                raise ValueError(f'Unknown output_format "{output_format}"')
-
-            output = ""
-            for name, res in result.items():
-                for err_code, err_desc in res["errors"]:
-                    # The script would be faster if instead of filtering the
-                    # errors after validating them, it didn't validate them
-                    # initially. But that would complicate the code too much
-                    if errors and err_code not in errors:
-                        continue
-                    exit_status += 1
-                    output += output_format.format(
-                        path=res["file"],
-                        row=res["file_line"],
-                        code=err_code,
-                        text=f"{name}: {err_desc}",
-                    )
-
-        sys.stdout.write(output)
-
-    else:
-        result = validate_one(func_name)
-        sys.stderr.write(header(f"Docstring ({func_name})"))
-        sys.stderr.write(f"{result['docstring']}\n")
-        sys.stderr.write(header("Validation"))
-        if result["errors"]:
-            sys.stderr.write(f"{len(result['errors'])} Errors found:\n")
-            for err_code, err_desc in result["errors"]:
-                # Failing examples are printed at the end
-                if err_code == "EX02":
-                    sys.stderr.write("\tExamples do not pass tests\n")
-                    continue
-                sys.stderr.write(f"\t{err_desc}\n")
-        if result["warnings"]:
-            sys.stderr.write(f"{len(result['warnings'])} Warnings found:\n")
-            for wrn_code, wrn_desc in result["warnings"]:
-                sys.stderr.write(f"\t{wrn_desc}\n")
+    result = pandas_validate(func_name)
 
-        if not result["errors"]:
-            sys.stderr.write(f'Docstring for "{func_name}" correct. :)\n')
+    sys.stderr.write(header(f"Docstring ({func_name})"))
+    sys.stderr.write(f"{result['docstring']}\n")
 
-        if result["examples_errors"]:
-            sys.stderr.write(header("Doctests"))
-            sys.stderr.write(result["examples_errors"])
+    sys.stderr.write(header("Validation"))
+    if result["errors"]:
+        sys.stderr.write(f'{len(result["errors"])} Errors found:\n')
+        for err_code, err_desc in result["errors"]:
+            if err_code == "EX02":  # Failing examples are printed at the end
+                sys.stderr.write("\tExamples do not pass tests\n")
+                continue
+            sys.stderr.write(f"\t{err_desc}\n")
+    else:
+        sys.stderr.write(f'Docstring for "{func_name}" correct. :)\n')
 
-    return exit_status
+    if result["examples_errs"]:
+        sys.stderr.write(header("Doctests"))
+        sys.stderr.write(result["examples_errs"])
+
+
+def main(func_name, prefix, errors, output_format, ignore_deprecated):
+    """
+    Main entry point. Call the validation for one or for all docstrings.
+    """
+    if func_name is None:
+        return print_validate_all_results(
+            prefix, errors, output_format, ignore_deprecated
+        )
+    else:
+        print_validate_one_results(func_name)
+        return 0
 
 
 if __name__ == "__main__":
-    format_opts = "default", "json", "azure"
+    format_opts = "default", "json", "actions"
     func_help = (
         "function or method to validate (e.g. pandas.DataFrame.head) "
         "if not provided, all docstrings are validated and returned "
@@ -1020,16 +399,16 @@ def header(title, width=80, char="#"):
         default="default",
         choices=format_opts,
         help="format of the output when validating "
-        "multiple docstrings (ignored when validating one)."
-        f"It can be {str(format_opts)[1:-1]}",
+        "multiple docstrings (ignored when validating one). "
+        f"It can be {str(format_opts)[1:-1]}",
     )
     argparser.add_argument(
         "--prefix",
         default=None,
         help="pattern for the "
         "docstring names, in order to decide which ones "
-        'will be validated. A prefix "pandas.Series.str.'
-        "will make the script validate all the docstrings"
+        'will be validated. A prefix "pandas.Series.str."'
+        "will make the script validate all the docstrings "
         "of methods starting by this pattern. It is "
         "ignored if parameter function is provided",
     )

From 2ba6be77a42e1e10917d06af1e7fe776b2260c0f Mon Sep 17 00:00:00 2001
From: gdex1 <40249442+gdex1@users.noreply.github.com>
Date: Thu, 16 Jan 2020 04:12:08 -0500
Subject: [PATCH 062/158] TST: Disallow bare pytest.raises (#31045)

---
 pandas/tests/arithmetic/test_object.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
index 799ef3492e53f..c0d3c9d4977bd 100644
--- a/pandas/tests/arithmetic/test_object.py
+++ b/pandas/tests/arithmetic/test_object.py
@@ -137,7 +137,13 @@ def test_objarr_radd_str_invalid(self, dtype, data, box_with_array):
         ser = Series(data, dtype=dtype)
 
         ser = tm.box_expected(ser, box_with_array)
-        with pytest.raises(TypeError):
+        msg = (
+            "can only concatenate str|"
+            "did not contain a loop with signature matching types|"
+            "unsupported operand type|"
+            "must be str"
+        )
+        with pytest.raises(TypeError, match=msg):
             "foo_" + ser
 
     @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub])

From 5d4973042f7499a79f249b578aa0d4432231ee9f Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 16 Jan 2020 01:59:05 -0800
Subject: [PATCH 063/158] CLN: assorted cleanups (#31056)

---
 pandas/core/indexes/interval.py              | 2 +-
 pandas/core/internals/managers.py            | 6 ------
 pandas/core/series.py                        | 2 +-
 pandas/tests/frame/indexing/test_indexing.py | 2 +-
 4 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 1c86235f9eaa1..8da6907750ac7 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -1055,7 +1055,7 @@ def equals(self, other) -> bool:
         if not isinstance(other, IntervalIndex):
             if not is_interval_dtype(other):
                 return False
-            other = Index(getattr(other, ".values", other))
+            other = Index(other)
 
         return (
             self.left.equals(other.left)
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 01b2c36e9adf3..1fce2594062d5 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -669,12 +669,6 @@ def is_numeric_mixed_type(self):
         self._consolidate_inplace()
         return all(block.is_numeric for block in self.blocks)
 
-    @property
-    def is_datelike_mixed_type(self):
-        # Warning, consolidation needs to get checked upstairs
-        self._consolidate_inplace()
-        return any(block.is_datelike for block in self.blocks)
-
     @property
     def any_extension_types(self):
         """Whether any of the blocks in this manager are extension blocks"""
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 33565bbedade6..01b68550391e6 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -701,7 +701,7 @@ def __array__(self, dtype=None) -> np.ndarray:
         Returns
         -------
         numpy.ndarray
-            The values in the series converted to a :class:`numpy.ndarary`
+            The values in the series converted to a :class:`numpy.ndarray`
             with the specified `dtype`.
 
         See Also
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index 40ecda7d74952..cbb9dd09bbede 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -2179,7 +2179,7 @@ def test_type_error_multiindex(self):
         dg = df.pivot_table(index="i", columns="c", values=["x", "y"])
 
         with pytest.raises(TypeError, match="is an invalid key"):
-            str(dg[:, 0])
+            dg[:, 0]
 
         index = Index(range(2), name="i")
         columns = MultiIndex(

From c040f14574fcb7fd366858e22c5bdb393571236c Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 16 Jan 2020 02:23:26 -0800
Subject: [PATCH 064/158] CLN: remove redundant return value (#31065)

---
 pandas/_libs/tslibs/parsing.pyx              | 17 ++++++++---------
 pandas/_libs/tslibs/period.pyx               |  2 +-
 pandas/core/indexes/datetimes.py             |  4 ++--
 pandas/core/indexes/period.py                | 11 ++++++-----
 pandas/core/tools/datetimes.py               |  1 -
 pandas/tests/indexes/datetimes/test_tools.py |  8 ++++----
 pandas/tests/tslibs/test_parsing.py          | 16 +++++++--------
 7 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index 3705b0a41fe55..ebdf7a1e29216 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -217,7 +217,7 @@ def parse_datetime_string(date_string: str, freq=None, dayfirst=False,
         return dt
 
     try:
-        dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
+        dt, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
         return dt
     except DateParseError:
         raise
@@ -280,7 +279,6 @@ cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False,
 
     Returns
    -------
     datetime
-    datetime/dateutil.parser._result
     str
         Inferred resolution of the parsed string.
@@ -297,7 +296,7 @@ cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False,
 
     parsed, reso = _parse_delimited_date(date_string, dayfirst)
     if parsed is not None:
-        return parsed, parsed, reso
+        return parsed, reso
 
     try:
         return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
@@ -315,7 +314,7 @@ cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False,
         raise DateParseError(err)
     if parsed is None:
         raise DateParseError(f"Could not parse {date_string}")
-    return parsed, parsed, reso
+    return parsed, reso
 
 
 cpdef bint _does_string_look_like_datetime(str py_string):
@@ -375,7 +374,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default,
     assert isinstance(date_string, str)
 
     if date_string in nat_strings:
-        return NaT, NaT, ''
+        return NaT, ''
 
     date_string = date_string.upper()
     date_len = len(date_string)
@@ -384,7 +383,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default,
         # parse year only like 2000
         try:
             ret = default.replace(year=int(date_string))
-            return ret, ret, 'year'
+            return ret, 'year'
         except ValueError:
             pass
 
@@ -441,7 +440,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default,
             month = (quarter - 1) * 3 + 1
 
             ret = default.replace(year=year, month=month)
-            return ret, ret, 'quarter'
+            return ret, 'quarter'
 
         except DateParseError:
             raise
@@ -454,14 +453,14 @@ cdef inline object _parse_dateabbr_string(object date_string, object default,
         month = int(date_string[4:6])
         try:
             ret = default.replace(year=year, month=month)
-            return ret, ret, 'month'
+            return ret, 'month'
         except ValueError:
             pass
 
     for pat in ['%Y-%m', '%b %Y', '%b-%Y']:
         try:
             ret = datetime.strptime(date_string, pat)
-            return ret, ret, 'month'
+            return ret, 'month'
         except ValueError:
             pass
 
diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index bd57e75c72f19..3dd560ece188d 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -2466,7 +2466,7 @@ class Period(_Period):
             if util.is_integer_object(value):
                 value = str(value)
             value = value.upper()
-            dt, _, reso = parse_time_string(value, freq)
+            dt, reso = parse_time_string(value, freq)
 
             if dt is NaT:
                 ordinal = NPY_NAT
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 23ced8987d8ac..942b51eda7d0b 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -750,7 +750,7 @@ def _maybe_cast_slice_bound(self, label, side, kind):
         if isinstance(label, str):
             freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None))
-            _, parsed, reso = parsing.parse_time_string(label, freq)
+            parsed, reso = parsing.parse_time_string(label, freq)
             lower, upper = self._parsed_string_to_bounds(reso, parsed)
             # lower, upper form the half-open interval:
             #   [parsed, parsed + 1 freq)
@@ -766,7 +766,7 @@ def _maybe_cast_slice_bound(self, label, side, kind):
 
     def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True):
         freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None))
-        _, parsed, reso = parsing.parse_time_string(key, freq)
+        parsed, reso = parsing.parse_time_string(key, freq)
         loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs)
         return loc
 
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index 123353b620bfa..a54d09e8bede0 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -5,6 +5,7 @@
 
 from pandas._libs import index as libindex
 from pandas._libs.tslibs import NaT, frequencies as libfrequencies, resolution
+from pandas._libs.tslibs.parsing import parse_time_string
 from pandas._libs.tslibs.period import Period
 from pandas.util._decorators import Appender, Substitution, cache_readonly
 
@@ -44,7 +45,7 @@
 from pandas.core.indexes.datetimes import DatetimeIndex, Index
 from pandas.core.indexes.numeric import Int64Index
 from pandas.core.ops import get_op_result_name
-from pandas.core.tools.datetimes import DateParseError, parse_time_string
+from pandas.core.tools.datetimes import DateParseError
 
 from pandas.tseries import frequencies
 from pandas.tseries.offsets import DateOffset, Tick
@@ -511,7 +512,7 @@ def get_value(self, series, key):
             return series.iat[key]
 
         if isinstance(key, str):
-            asdt, parsed, reso = parse_time_string(key, self.freq)
+            asdt, reso = parse_time_string(key, self.freq)
             grp = resolution.Resolution.get_freq_group(reso)
             freqn = resolution.get_freq_group(self.freq)
 
@@ -601,7 +602,7 @@ def get_loc(self, key, method=None, tolerance=None):
 
         if isinstance(key, str):
             try:
-                asdt, parsed, reso = parse_time_string(key, self.freq)
+                asdt, reso = parse_time_string(key, self.freq)
                 key = asdt
             except DateParseError:
                 # A string with invalid format
@@ -659,7 +660,7 @@ def _maybe_cast_slice_bound(self, label, side, kind):
             return Period(label, freq=self.freq)
         elif isinstance(label, str):
             try:
-                _, parsed, reso = parse_time_string(label, self.freq)
+                parsed, reso = parse_time_string(label, self.freq)
                 bounds = self._parsed_string_to_bounds(reso, parsed)
                 return bounds[0 if side == "left" else 1]
             except ValueError:
@@ -716,7 +717,7 @@ def _get_string_slice(self, key):
         if not self.is_monotonic:
             raise ValueError("Partial indexing only valid for ordered time series")
 
-        key, parsed, reso = parse_time_string(key, self.freq)
+        parsed, reso = parse_time_string(key, self.freq)
         grp = resolution.Resolution.get_freq_group(reso)
         freqn = resolution.get_freq_group(self.freq)
         if reso in ["day", "hour", "minute", "second"] and not grp < freqn:
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 898fbc6f8bc3b..84c17748c503c 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -12,7 +12,6 @@
     DateParseError,
     _format_is_iso,
     _guess_datetime_format,
-    parse_time_string,
 )
 from pandas._libs.tslibs.strptime import array_strptime
 from pandas._typing import ArrayLike
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index fe65653ba6545..a5332eaea0432 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -1874,7 +1874,7 @@ def test_parsers(self, date_str, expected, cache):
         # https://github.com/dateutil/dateutil/issues/217
         yearfirst = True
 
-        result1, _, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst)
+        result1, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst)
         result2 = to_datetime(date_str, yearfirst=yearfirst)
         result3 = to_datetime([date_str], yearfirst=yearfirst)
         # result5 is used below
@@ -1910,7 +1910,7 @@ def test_na_values_with_cache(
     def test_parsers_nat(self):
         # Test that each of several string-accepting methods return pd.NaT
-        result1, _, _ = parsing.parse_time_string("NaT")
+        result1, _ = parsing.parse_time_string("NaT")
         result2 = to_datetime("NaT")
         result3 = Timestamp("NaT")
         result4 = DatetimeIndex(["NaT"])[0]
@@ -1986,7 +1986,7 @@ def test_parsers_dayfirst_yearfirst(self, cache):
             )
             assert dateutil_result == expected
 
-            result1, _, _ = parsing.parse_time_string(
+            result1, _ = parsing.parse_time_string(
                 date_str, dayfirst=dayfirst, yearfirst=yearfirst
             )
 
@@ -2016,7 +2016,7 @@ def test_parsers_timestring(self, cache):
         }
 
         for date_str, (exp_now, exp_def) in cases.items():
-            result1, _, _ = parsing.parse_time_string(date_str)
+            result1, _ = parsing.parse_time_string(date_str)
             result2 = to_datetime(date_str)
             result3 = to_datetime([date_str])
             result4 = Timestamp(date_str)
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
index 36f7ada7326bf..c452d5b12ce01 100644
--- a/pandas/tests/tslibs/test_parsing.py
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -15,10 +15,9 @@
 
 
 def test_parse_time_string():
-    (date, parsed, reso) = parse_time_string("4Q1984")
-    (date_lower, parsed_lower, reso_lower) = parse_time_string("4q1984")
+    (parsed, reso) = parse_time_string("4Q1984")
+    (parsed_lower, reso_lower) = parse_time_string("4q1984")
 
-    assert date == date_lower
     assert reso == reso_lower
     assert parsed == parsed_lower
 
@@ -34,10 +33,9 @@ def test_parse_time_string_invalid_type():
 )
 def test_parse_time_quarter_with_dash(dashed, normal):
     # see gh-9688
-    (date_dash, parsed_dash, reso_dash) = parse_time_string(dashed)
-    (date, parsed, reso) = parse_time_string(normal)
+    (parsed_dash, reso_dash) = parse_time_string(dashed)
+    (parsed, reso) = parse_time_string(normal)
 
-    assert date_dash == date
     assert parsed_dash == parsed
     assert reso_dash == reso
 
@@ -106,7 +104,7 @@ def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg):
     ],
 )
 def test_parsers_quarterly_with_freq(date_str, freq, expected):
-    result, _, _ = parsing.parse_time_string(date_str, freq=freq)
+    result, _ = parsing.parse_time_string(date_str, freq=freq)
     assert result == expected
 
 
@@ -131,7 +129,7 @@ def test_parsers_quarter_invalid(date_str):
     [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))],
 )
 def test_parsers_month_freq(date_str, expected):
-    result, _, _ = parsing.parse_time_string(date_str, freq="M")
+    result, _ = parsing.parse_time_string(date_str, freq="M")
     assert result == expected
 
 
@@ -223,5 +221,5 @@ def test_parse_time_string_check_instance_type_raise_exception():
         parse_time_string((1, 2, 3))
 
     result = parse_time_string("2019")
-    expected = (datetime(2019, 1, 1), datetime(2019, 1, 1), "year")
+    expected = (datetime(2019, 1, 1), "year")
     assert result == expected

From 22fe19c1408c5f3ac0f353a2c3df53260b2520bf Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Thu, 16 Jan 2020 12:26:00 +0200
Subject: [PATCH 065/158] TYP/CLN: Replaced "Optional[Hashable]" with "Label" from pandas._typing (#31062)

---
 pandas/core/generic.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 0c413cd473bbc..b73f129fbda8e 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -37,6 +37,7 @@
     FilePathOrBuffer,
     FrameOrSeries,
     JSONSerializable,
+    Label,
     Level,
     Renamer,
 )
@@ -3009,10 +3010,10 @@ def to_csv(
         sep: str = ",",
         na_rep: str = "",
         float_format: Optional[str] = None,
-        columns: Optional[Sequence[Optional[Hashable]]] = None,
+        columns: Optional[Sequence[Label]] = None,
         header: Union[bool_t, List[str]] = True,
         index: bool_t = True,
-        index_label: Optional[Union[bool_t, str, Sequence[Optional[Hashable]]]] = None,
+        index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None,
         mode: str = "w",
         encoding: Optional[str] = None,
         compression: Optional[Union[str, Mapping[str, str]]] = "infer",
From 7eafd735806cdf3dc5acaac99bb0155da52a3138 Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Thu, 16 Jan 2020 18:19:00 +0200
Subject: [PATCH 066/158] CLN: Removed unused variables from for loops (#31059)

---
 pandas/io/sas/sas.pyx | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx
index 211935009d2e5..b4f8eeb3d226d 100644
--- a/pandas/io/sas/sas.pyx
+++ b/pandas/io/sas/sas.pyx
@@ -33,7 +33,7 @@ cdef const uint8_t[:] rle_decompress(int result_length,
                 raise ValueError("Unexpected non-zero end_of_first_byte")
             nbytes = (inbuff[ipos]) + 64
             ipos += 1
-            for i in range(nbytes):
+            for _ in range(nbytes):
                 result[rpos] = inbuff[ipos]
                 rpos += 1
             ipos += 1
@@ -42,20 +42,20 @@ cdef const uint8_t[:] rle_decompress(int result_length,
             nbytes = end_of_first_byte * 16
             nbytes += (inbuff[ipos])
             ipos += 1
-            for i in range(nbytes):
+            for _ in range(nbytes):
                 result[rpos] = inbuff[ipos]
                 rpos += 1
                 ipos += 1
         elif control_byte == 0x60:
             nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17
             ipos += 1
-            for i in range(nbytes):
+            for _ in range(nbytes):
                 result[rpos] = 0x20
                 rpos += 1
         elif control_byte == 0x70:
             nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17
             ipos += 1
-            for i in range(nbytes):
+            for _ in range(nbytes):
                 result[rpos] = 0x00
                 rpos += 1
         elif control_byte == 0x80:
@@ -86,22 +86,22 @@ cdef const uint8_t[:] rle_decompress(int result_length,
             nbytes = end_of_first_byte + 3
             x = inbuff[ipos]
             ipos += 1
-            for i in range(nbytes):
+            for _ in range(nbytes):
                 result[rpos] = x
                 rpos += 1
         elif control_byte == 0xD0:
             nbytes = end_of_first_byte + 2
-            for i in range(nbytes):
+            for _ in range(nbytes):
                 result[rpos] = 0x40
                 rpos += 1
         elif control_byte == 0xE0:
             nbytes = end_of_first_byte + 2
-            for i in range(nbytes):
+            for _ in range(nbytes):
                 result[rpos] = 0x20
                 rpos += 1
         elif control_byte == 0xF0:
             nbytes = end_of_first_byte + 2
-            for i in range(nbytes):
+            for _ in range(nbytes):
                 result[rpos] = 0x00
                 rpos += 1
         else:
@@ -289,7 +289,7 @@ cdef class Parser:
             bint done
            int i
 
-        for i in range(nrows):
+        for _ in range(nrows):
            done = self.readline()
            if done:
                break

From 7a1dad130ce183a96f76f303120b688e998e129c Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Thu, 16 Jan 2020 20:34:42 +0200
Subject: [PATCH 067/158] CLN/STY: Some stuff that got the attention of my eye (#31076)

---
 pandas/_libs/hashtable.pyx           | 60 ++++++++++++++++----------
 pandas/_libs/window/aggregations.pyx | 13 ++++--
 pandas/io/sas/sas.pyx                | 50 ++++++++++-------------
 pandas/tseries/offsets.py            |  4 +-
 setup.py                             |  2 +-
 5 files changed, 74 insertions(+), 55 deletions(-)

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 59ba1705d2dbb..884db9ee931d4 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -13,26 +13,45 @@ cnp.import_array()
 cdef extern from "numpy/npy_math.h":
     float64_t NAN "NPY_NAN"
 
-
 from pandas._libs.khash cimport (
     khiter_t,
-
-    kh_str_t, kh_init_str, kh_put_str, kh_exist_str,
-    kh_get_str, kh_destroy_str, kh_resize_str,
-
-    kh_put_strbox, kh_get_strbox, kh_init_strbox,
-
-    kh_int64_t, kh_init_int64, kh_resize_int64, kh_destroy_int64,
-    kh_get_int64, kh_exist_int64, kh_put_int64,
-
-    kh_float64_t, kh_exist_float64, kh_put_float64, kh_init_float64,
-    kh_get_float64, kh_destroy_float64, kh_resize_float64,
-
-    kh_resize_uint64, kh_exist_uint64, kh_destroy_uint64, kh_put_uint64,
-    kh_get_uint64, kh_init_uint64,
-
-    kh_destroy_pymap, kh_exist_pymap, kh_init_pymap, kh_get_pymap,
-    kh_put_pymap, kh_resize_pymap)
+    kh_str_t,
+    kh_init_str,
+    kh_put_str,
+    kh_exist_str,
+    kh_get_str,
+    kh_destroy_str,
+    kh_resize_str,
+    kh_put_strbox,
+    kh_get_strbox,
+    kh_init_strbox,
+    kh_int64_t,
+    kh_init_int64,
+    kh_resize_int64,
+    kh_destroy_int64,
+    kh_get_int64,
+    kh_exist_int64,
+    kh_put_int64,
+    kh_float64_t,
+    kh_exist_float64,
+    kh_put_float64,
+    kh_init_float64,
+    kh_get_float64,
+    kh_destroy_float64,
+    kh_resize_float64,
+    kh_resize_uint64,
+    kh_exist_uint64,
+    kh_destroy_uint64,
+    kh_put_uint64,
+    kh_get_uint64,
+    kh_init_uint64,
+    kh_destroy_pymap,
+    kh_exist_pymap,
+    kh_init_pymap,
+    kh_get_pymap,
+    kh_put_pymap,
+    kh_resize_pymap,
+)
 
 cimport pandas._libs.util as util
@@ -63,8 +82,9 @@ cdef class Factorizer:
     def get_count(self):
         return self.count
 
-    def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1,
-                  na_value=None):
+    def factorize(
+        self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
+    ):
         """
         Factorize values with nans replaced by na_sentinel
         >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index fe74d701ef00f..f675818599b2c 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -56,8 +56,9 @@ cdef:
 cdef inline int int_max(int a, int b): return a if a >= b else b
 cdef inline int int_min(int a, int b): return a if a <= b else b
 
-cdef inline bint is_monotonic_start_end_bounds(ndarray[int64_t, ndim=1] start,
-                                               ndarray[int64_t, ndim=1] end):
+cdef inline bint is_monotonic_start_end_bounds(
+    ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end
+):
     return is_monotonic(start, False)[0] and is_monotonic(end, False)[0]
 
 # Cython implementations of rolling sum, mean, variance, skewness,
@@ -90,8 +91,12 @@ cdef inline bint is_monotonic_start_end_bounds(ndarray[int64_t, ndim=1] start,
 
 # this is only an impl for index not None, IOW, freq aware
 
-def roll_count(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end,
-               int64_t minp):
+def roll_count(
+    ndarray[float64_t] values,
+    ndarray[int64_t] start,
+    ndarray[int64_t] end,
+    int64_t minp,
+):
     cdef:
         float64_t val, count_x = 0.0
         int64_t s, e, nobs, N = len(values)
diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx
index b4f8eeb3d226d..40fea0aaf0d07 100644
--- a/pandas/io/sas/sas.pyx
+++ b/pandas/io/sas/sas.pyx
@@ -13,8 +13,7 @@ ctypedef unsigned short uint16_t
 # algorithm.  It is partially documented here:
 #
 # https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
-cdef const uint8_t[:] rle_decompress(int result_length,
-                                     const uint8_t[:] inbuff):
+cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff):
 
     cdef:
         uint8_t control_byte, x
@@ -117,8 +116,7 @@ cdef const uint8_t[:] rle_decompress(int result_length,
 # rdc_decompress decompresses data using the Ross Data Compression algorithm:
 #
 # http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
-cdef const uint8_t[:] rdc_decompress(int result_length,
-                                     const uint8_t[:] inbuff):
+cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff):
 
     cdef:
         uint8_t cmd
@@ -233,8 +231,7 @@ cdef class Parser:
         int subheader_pointer_length
         int current_page_type
         bint is_little_endian
-        const uint8_t[:] (*decompress)(int result_length,
-                                       const uint8_t[:] inbuff)
+        const uint8_t[:] (*decompress)(int result_length, const uint8_t[:] inbuff)
         object parser
 
     def __init__(self, object parser):
@@ -267,9 +264,7 @@ cdef class Parser:
             elif column_types[j] == b's':
                 self.column_types[j] = column_type_string
             else:
-                raise ValueError(
-                    f"unknown column type: {self.parser.columns[j].ctype}"
-                )
+                raise ValueError(f"unknown column type: {self.parser.columns[j].ctype}")
 
         # compression
         if parser.compression == const.rle_compression:
@@ -296,8 +291,7 @@ cdef class Parser:
 
         # update the parser
         self.parser._current_row_on_page_index = self.current_row_on_page_index
-        self.parser._current_row_in_chunk_index =\
-            self.current_row_in_chunk_index
+        self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index
        self.parser._current_row_in_file_index = self.current_row_in_file_index
 
     cdef bint read_next_page(self):
@@ -318,9 +312,9 @@ cdef class Parser:
         self.current_page_type = self.parser._current_page_type
         self.current_page_block_count = self.parser._current_page_block_count
         self.current_page_data_subheader_pointers_len = len(
-            self.parser._current_page_data_subheader_pointers)
-        self.current_page_subheaders_count =\
-            self.parser._current_page_subheaders_count
+            self.parser._current_page_data_subheader_pointers
+        )
+        self.current_page_subheaders_count = self.parser._current_page_subheaders_count
 
     cdef readline(self):
 
@@ -358,19 +352,18 @@ cdef class Parser:
                     return False
         elif (self.current_page_type == page_mix_types_0 or
                 self.current_page_type == page_mix_types_1):
-            align_correction = (bit_offset + subheader_pointers_offset +
-                                self.current_page_subheaders_count *
-                                subheader_pointer_length)
+            align_correction = (
+                bit_offset
+                + subheader_pointers_offset
+                + self.current_page_subheaders_count * subheader_pointer_length
+            )
             align_correction = align_correction % 8
             offset = bit_offset + align_correction
             offset += subheader_pointers_offset
-            offset += (self.current_page_subheaders_count *
-                       subheader_pointer_length)
+            offset += self.current_page_subheaders_count * subheader_pointer_length
             offset += self.current_row_on_page_index * self.row_length
-            self.process_byte_array_with_data(offset,
-                                              self.row_length)
-            mn = min(self.parser.row_count,
-                     self.parser._mix_page_row_count)
+            self.process_byte_array_with_data(offset, self.row_length)
+            mn = min(self.parser.row_count, self.parser._mix_page_row_count)
             if self.current_row_on_page_index == mn:
                 done = self.read_next_page()
                 if done:
                     return False
         elif self.current_page_type & page_data_type == page_data_type:
             self.process_byte_array_with_data(
-                bit_offset + subheader_pointers_offset +
-                self.current_row_on_page_index * self.row_length,
-                self.row_length)
-            flag = (self.current_row_on_page_index ==
-                    self.current_page_block_count)
+                bit_offset
+                + subheader_pointers_offset
+                + self.current_row_on_page_index * self.row_length,
+                self.row_length,
+            )
+            flag = self.current_row_on_page_index == self.current_page_block_count
             if flag:
                 done = self.read_next_page()
                 if done:
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
index d31c23c7ccf1d..2808d74e68d4e 100644
--- a/pandas/tseries/offsets.py
+++ b/pandas/tseries/offsets.py
@@ -2667,8 +2667,8 @@ def _delta_to_tick(delta: timedelta) -> Tick:
             return Second(seconds)
     else:
         nanos = delta_to_nanoseconds(delta)
-        if nanos % 1000000 == 0:
-            return Milli(nanos // 1000000)
+        if nanos % 1_000_000 == 0:
+            return Milli(nanos // 1_000_000)
         elif nanos % 1000 == 0:
             return Micro(nanos // 1000)
         else:  # pragma: no cover
diff --git a/setup.py b/setup.py
index c33ce063cb4d9..6635b58cd7103 100755
--- a/setup.py
+++ b/setup.py
@@ -356,7 +356,7 @@ def run(self):
                 sourcefile = pyxfile[:-3] + extension
                 msg = (
                     f"{extension}-source file '{sourcefile}' not found.\n"
-                    f"Run 'setup.py cython' before sdist."
+                    "Run 'setup.py cython' before sdist."
                 )
                 assert os.path.isfile(sourcefile), msg
         sdist_class.run(self)

From 7b7bb83d3ddce3c296fc4ee880fa5a96fdd93545 Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Thu, 16 Jan 2020 21:49:50 +0200
Subject: [PATCH 068/158] CLN: Removed the same code in 'if' and the 'else' (#31081)

---
 setup.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 6635b58cd7103..86fe62202c643 100755
--- a/setup.py
+++ b/setup.py
@@ -412,15 +412,14 @@ def run(self):
 
 cmdclass.update({"clean": CleanCommand, "build": build})
 
+cmdclass["build_ext"] = CheckingBuildExt
 
 if cython:
     suffix = ".pyx"
 
-    cmdclass["build_ext"] = CheckingBuildExt
     cmdclass["cython"] = CythonCommand
 else:
     suffix = ".c"
     cmdclass["build_src"] = DummyBuildSrc
-    cmdclass["build_ext"] = CheckingBuildExt
 
 # ----------------------------------------------------------------------
 # Preparation of compiler arguments

From 721a2eb268b71168ad962ff818b19db15f4133ca Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 16 Jan 2020 11:55:20 -0800
Subject: [PATCH 069/158] CLN: remove unnecessary arg from _to_dt64 (#30895)

---
 pandas/_libs/tslibs/offsets.pyx | 8 ++++----
 pandas/tseries/offsets.py       | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
index f24dce28cd5f7..31dc2945f0395 100644
--- a/pandas/_libs/tslibs/offsets.pyx
+++ b/pandas/_libs/tslibs/offsets.pyx
@@ -216,7 +216,7 @@ def _get_calendar(weekmask, holidays, calendar):
             holidays = holidays + calendar.holidays().tolist()
         except AttributeError:
             pass
-    holidays = [_to_dt64(dt, dtype='datetime64[D]') for dt in holidays]
+    holidays = [_to_dt64D(dt) for dt in holidays]
     holidays = tuple(sorted(holidays))
 
     kwargs = {'weekmask': weekmask}
@@ -227,7 +227,7 @@ def _get_calendar(weekmask, holidays, calendar):
     return busdaycalendar, holidays
 
 
-def _to_dt64(dt, dtype='datetime64'):
+def _to_dt64D(dt):
     # Currently
     # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]')
     # numpy.datetime64('2013-05-01T02:00:00.000000+0200')
@@ -238,8 +238,8 @@ def _to_dt64(dt, dtype='datetime64'):
         dt = np.int64(dt).astype('datetime64[ns]')
     else:
         dt = np.datetime64(dt)
-    if dt.dtype.name != dtype:
-        dt = dt.astype(dtype)
+    if dt.dtype.name != "datetime64[D]":
+        dt = dt.astype("datetime64[D]")
     return dt
 
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
index 2808d74e68d4e..220ff241efa0c 100644
--- a/pandas/tseries/offsets.py
+++ b/pandas/tseries/offsets.py
@@ -26,7 +26,7 @@
     BaseOffset,
     _get_calendar,
     _is_normalized,
-    _to_dt64,
+    _to_dt64D,
     apply_index_wraps,
     as_datetime,
     roll_yearday,
@@ -1090,7 +1090,7 @@ def apply_index(self, i):
 
     def is_on_offset(self, dt: datetime) -> bool:
         if self.normalize and not _is_normalized(dt):
             return False
-        day64 = _to_dt64(dt, "datetime64[D]")
+        day64 = _to_dt64D(dt)
         return np.is_busday(day64, busdaycal=self.calendar)
 

From a8eaebfed39a13588e590cdb8c12f16b3141a147 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 16 Jan 2020 11:56:29 -0800
Subject: [PATCH 070/158] CLN: simplify Float64Index.__contains__ (#30899)

---
 pandas/core/indexes/numeric.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index 9a3a021bd801a..53f96ace890fb 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -477,20 +477,7 @@ def __contains__(self, other) -> bool:
         if super().__contains__(other):
             return True
 
-        try:
-            # if other is a sequence this throws a ValueError
-            return np.isnan(other) and self.hasnans
-        except ValueError:
-            try:
-                return len(other) <= 1 and other.item() in self
-            except AttributeError:
-                return len(other) <= 1 and other in self
-            except TypeError:
-                pass
-        except TypeError:
-            pass
-
-        return False
+        return is_float(other) and np.isnan(other) and self.hasnans
 
     @Appender(_index_shared_docs["get_loc"])
     def get_loc(self, key, method=None, tolerance=None):

From 208bb41c3aa7b59009556c8f4c039da25f0ddb2d Mon Sep 17 00:00:00 2001
From: Peter Bull
Date: Thu, 16 Jan 2020 12:00:04 -0800
Subject: [PATCH 071/158] DOC: Add pandas_path to the accessor list in the documentation (#30898)

---
 doc/source/ecosystem.rst | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index 7bd5ba7ecdf0b..be61b83d46a26 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -41,6 +41,16 @@ Pyjanitor provides a clean API for cleaning data, using method chaining.
 Engarde is a lightweight library used to explicitly state assumptions about your datasets
 and check that they're *actually* true.
 
+`pandas-path <https://github.com/drivendataorg/pandas-path/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since Python 3.4, `pathlib <https://docs.python.org/3/library/pathlib.html>`_ has been
+included in the Python standard library. Path objects provide a simple
+and delightful way to interact with the file system. The pandas-path package enables the
+Path API for pandas through a custom accessor ``.path``. Getting just the filenames from
+a series of full file paths is as simple as ``my_files.path.name``. Other convenient operations like
+joining paths, replacing file extensions, and checking if files exist are also available.
+
 .. _ecosystem.stats:
 
 Statistics and machine learning
@@ -386,12 +396,16 @@ A directory of projects providing :ref:`extension accessors `.
 This is for users to discover new accessors
 and for library authors to coordinate on the namespace.
 
-============== ========== =========================
-Library        Accessor   Classes
-============== ========== =========================
-`cyberpandas`_ ``ip``     ``Series``
-`pdvega`_      ``vgplot`` ``Series``, ``DataFrame``
-============== ========== =========================
+=============== ========== ========================= ===============================================================
+Library         Accessor   Classes                   Description
+=============== ========== ========================= ===============================================================
+`cyberpandas`_  ``ip``     ``Series``                Provides common operations for working with IP addresses.
+`pdvega`_       ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library.
+`pandas_path`_  ``path``   ``Index``, ``Series``     Provides `pathlib.Path`_ functions for Series.
+=============== ========== ========================= ===============================================================
 
 .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
 .. _pdvega: https://altair-viz.github.io/pdvega/
+.. _Altair: https://altair-viz.github.io/
+.. _pandas_path: https://github.com/drivendataorg/pandas-path/
+.. _pathlib.Path: https://docs.python.org/3/library/pathlib.html
\ No newline at end of file

From 171a1ed095e482db71f7dd655cc8b16706968ee7 Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Thu, 16 Jan 2020 22:30:54 +0200
Subject: [PATCH 072/158] TYP: _config/config.py && core/{apply,construction}.py (#30734)

---
 pandas/_config/config.py    | 153 ++++++++++++++++++++----------------
 pandas/core/apply.py        |   9 ++-
 pandas/core/construction.py |   2 +-
 3 files changed, 92 insertions(+), 72 deletions(-)

diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index 42df8a84a8c77..cacd6f5454de7 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -51,7 +51,18 @@
 from collections import namedtuple
 from contextlib import contextmanager
 import re
-from typing import Any, Dict, Iterable, List
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    cast,
+)
 import warnings
 
 DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver")
@@ -80,7 +91,7 @@ class OptionError(AttributeError, KeyError):
 # User API
 
 
-def _get_single_key(pat, silent):
+def _get_single_key(pat: str, silent: bool) -> str:
     keys = _select_options(pat)
     if len(keys) == 0:
         if not silent:
@@ -98,7 +109,7 @@ def _get_single_key(pat, silent):
     return key
 
 
-def _get_option(pat, silent=False):
+def _get_option(pat: str, silent: bool = False):
     key = _get_single_key(pat, silent)
 
     # walk the nested dict
@@ -106,7 +117,7 @@ def _get_option(pat, silent=False):
     return root[k]
 
 
-def _set_option(*args, **kwargs):
+def _set_option(*args, **kwargs) -> None:
     # must at least 1 arg deal with constraints later
     nargs = len(args)
     if not nargs or nargs % 2 != 0:
@@ -138,7 +149,7 @@ def _set_option(*args, **kwargs):
             o.cb(key)
 
 
-def _describe_option(pat="", _print_desc=True):
+def _describe_option(pat: str = "", _print_desc: bool = True):
     keys = _select_options(pat)
 
     if len(keys) == 0:
@@ -154,7 +165,7 @@ def _describe_option(pat="", _print_desc=True):
     return s
 
 
-def _reset_option(pat, silent=False):
+def _reset_option(pat: str, silent: bool = False) -> None:
 
     keys = _select_options(pat)
 
@@ -172,7 +183,7 @@ def _reset_option(pat, silent=False):
         _set_option(k, _registered_options[k].defval, silent=silent)
 
 
-def get_default_val(pat):
+def get_default_val(pat: str):
     key = _get_single_key(pat, silent=True)
     return _get_registered_option(key).defval
 
@@ -180,11 +191,11 @@ class DictWrapper:
     """ provide attribute-style access to a nested dict"""
 
-    def __init__(self, d, prefix=""):
+    def __init__(self, d: Dict[str, Any], prefix: str = ""):
         object.__setattr__(self, "d", d)
         object.__setattr__(self, "prefix", prefix)
 
-    def __setattr__(self, key, val):
+    def __setattr__(self, key: str, val: Any) -> None:
         prefix = object.__getattribute__(self, "prefix")
         if prefix:
             prefix += "."
@@ -210,7 +221,7 @@ def __getattr__(self, key: str):
         else:
             return _get_option(prefix)
 
-    def __dir__(self):
+    def __dir__(self) -> Iterable[str]:
         return list(self.d.keys())
 
@@ -411,23 +422,31 @@ def __exit__(self, *args):
             _set_option(pat, val, silent=True)
 
 
-def register_option(key: str, defval: object, doc="", validator=None, cb=None):
-    """Register an option in the package-wide pandas config object
+def register_option(
+    key: str,
+    defval: object,
+    doc: str = "",
+    validator: Optional[Callable[[Any], Any]] = None,
+    cb: Optional[Callable[[str], Any]] = None,
+) -> None:
+    """
+    Register an option in the package-wide pandas config object
 
     Parameters
     ----------
-    key - a fully-qualified key, e.g. "x.y.option - z".
-    defval - the default value of the option
-    doc - a string description of the option
-    validator - a function of a single argument, should raise `ValueError` if
-                called with a value which is not a legal value for the option.
-    cb - a function of a single argument "key", which is called
-         immediately after an option value is set/reset. key is
-         the full name of the option.
-
-    Returns
-    -------
-    Nothing.
+    key : str
+        Fully-qualified key, e.g. "x.y.option - z".
+    defval : object
+        Default value of the option.
+    doc : str
+        Description of the option.
+    validator : Callable, optional
+        Function of a single argument, should raise `ValueError` if
+        called with a value which is not a legal value for the option.
+    cb
+        a function of a single argument "key", which is called
+        immediately after an option value is set/reset. key is
+        the full name of the option.
 
     Raises
     ------
@@ -480,7 +499,9 @@ def register_option(key: str, defval: object, doc="", validator=None, cb=None):
     )
 
 
-def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
+def deprecate_option(
+    key: str, msg: Optional[str] = None, rkey: Optional[str] = None, removal_ver=None
+) -> None:
     """
     Mark option `key` as deprecated, if code attempts to access this option,
     a warning will be produced, using `msg` if given, or a default message
@@ -493,32 +514,27 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
 
     Parameters
     ----------
-    key - the name of the option to be deprecated. must be a fully-qualified
-          option name (e.g "x.y.z.rkey").
-
-    msg - (Optional) a warning message to output when the key is referenced.
-          if no message is given a default message will be emitted.
-
-    rkey - (Optional) the name of an option to reroute access to.
-           If specified, any referenced `key` will be re-routed to `rkey`
-           including set/get/reset.
-           rkey must be a fully-qualified option name (e.g "x.y.z.rkey").
-           used by the default message if no `msg` is specified.
-
-    removal_ver - (Optional) specifies the version in which this option will
-                  be removed. used by the default message if no `msg`
-                  is specified.
-
-    Returns
-    -------
-    Nothing
+    key : str
+        Name of the option to be deprecated.
+        must be a fully-qualified option name (e.g "x.y.z.rkey").
+    msg : str, optional
+        Warning message to output when the key is referenced.
+        if no message is given a default message will be emitted.
+    rkey : str, optional
+        Name of an option to reroute access to.
+        If specified, any referenced `key` will be
+        re-routed to `rkey` including set/get/reset.
+        rkey must be a fully-qualified option name (e.g "x.y.z.rkey").
+        used by the default message if no `msg` is specified.
+    removal_ver : optional
+        Specifies the version in which this option will
+        be removed. used by the default message if no `msg` is specified.
 
     Raises
     ------
-    OptionError - if key has already been deprecated.
-
+    OptionError
+        If the specified key has already been deprecated.
     """
-
     key = key.lower()
 
     if key in _deprecated_options:
@@ -531,7 +547,7 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
 # functions internal to the module
 
 
-def _select_options(pat):
+def _select_options(pat: str) -> List[str]:
     """returns a list of keys matching `pat`
 
     if pat=="all", returns all registered options
@@ -549,7 +565,7 @@ def _select_options(pat):
     return [k for k in keys if re.search(pat, k, re.I)]
 
 
-def _get_root(key):
+def _get_root(key: str) -> Tuple[Dict[str, Any], str]:
     path = key.split(".")
     cursor = _global_config
     for p in path[:-1]:
@@ -557,14 +573,14 @@ def _get_root(key):
     return cursor, path[-1]
 
 
-def _is_deprecated(key):
+def _is_deprecated(key: str) -> bool:
     """ Returns True if the given option has been deprecated """
 
     key = key.lower()
     return key in _deprecated_options
 
 
-def _get_deprecated_option(key):
+def _get_deprecated_option(key: str):
     """
     Retrieves the metadata for a deprecated option, if `key` is deprecated.
 
@@ -581,7 +597,7 @@ def _get_deprecated_option(key):
         return d
 
 
-def _get_registered_option(key):
+def _get_registered_option(key: str):
     """
     Retrieves the option metadata if `key` is a registered option.
 
@@ -592,7 +608,7 @@ def _get_registered_option(key):
     return _registered_options.get(key)
 
 
-def _translate_key(key):
+def _translate_key(key: str) -> str:
     """
     if key id deprecated and a replacement key defined, will return the
     replacement key, otherwise returns `key` as - is
@@ -605,7 +621,7 @@ def _translate_key(key):
         return key
 
 
-def _warn_if_deprecated(key):
+def _warn_if_deprecated(key: str) -> bool:
     """
     Checks if `key` is a deprecated option and if so, prints a warning.
 
@@ -633,7 +649,7 @@ def _warn_if_deprecated(key):
     return False
 
 
-def _build_option_description(k):
+def _build_option_description(k: str) -> str:
     """ Builds a formatted description of a registered option and prints it """
 
     o = _get_registered_option(k)
@@ -658,7 +674,7 @@ def _build_option_description(k):
     return s
 
 
-def pp_options_list(keys, width=80, _print=False):
+def pp_options_list(keys: Iterable[str], width=80, _print: bool = False):
     """ Builds a concise listing of available options, grouped by prefix """
 
     from textwrap import wrap
@@ -696,6 +712,9 @@ def pp(name: str, ks: Iterable[str]) -> List[str]:
 #
 # helpers
 
+FuncType = Callable[..., Any]
+F = TypeVar("F", bound=FuncType)
+
 
 @contextmanager
 def config_prefix(prefix):
@@ -727,12 +746,12 @@ def config_prefix(prefix):
 
     global register_option, get_option, set_option, reset_option
 
-    def wrap(func):
-        def inner(key, *args, **kwds):
+    def wrap(func: F) -> F:
+        def inner(key: str, *args, **kwds):
             pkey = f"{prefix}.{key}"
             return func(pkey, *args, **kwds)
 
-        return inner
+        return cast(F, inner)
 
     _register_option = register_option
     _get_option = get_option
@@ -750,7 +769,7 @@ def inner(key, *args, **kwds):
 # arg in register_option
 
 
-def is_type_factory(_type):
+def is_type_factory(_type: Type[Any]) -> Callable[[Any], None]:
     """
 
     Parameters
@@ -764,14 +783,14 @@ def is_type_factory(_type):
 
     """
 
-    def inner(x):
+    def inner(x) -> None:
         if type(x) != _type:
             raise ValueError(f"Value must have type '{_type}'")
 
     return inner
 
 
-def is_instance_factory(_type):
+def is_instance_factory(_type) -> Callable[[Any], None]:
     """
 
     Parameters
@@ -791,19 +810,19 @@ def is_instance_factory(_type):
     else:
         type_repr = f"'{_type}'"
 
-    def inner(x):
+    def inner(x) -> None:
         if not isinstance(x, _type):
             raise ValueError(f"Value must be an instance of {type_repr}")
 
     return inner
 
 
-def is_one_of_factory(legal_values):
+def is_one_of_factory(legal_values) -> Callable[[Any], None]:
 
     callables = [c for c in legal_values if callable(c)]
     legal_values = [c for c in legal_values if not callable(c)]
 
-    def inner(x):
+    def inner(x) -> None:
         if x not in legal_values:
 
             if not any(c(x) for c in callables):
@@ -817,7 +836,7 @@ def inner(x):
     return inner
 
 
-def is_nonnegative_int(value):
+def is_nonnegative_int(value: Optional[int]) -> None:
     """
     Verify that value is None or a positive int.
 
@@ -852,7 +871,7 @@ def is_nonnegative_int(value):
 is_text = is_instance_factory((str, bytes))
 
 
-def is_callable(obj):
+def is_callable(obj) -> bool:
     """
 
     Parameters
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 14a3c3c008e92..ca1be3154757a 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -1,10 +1,11 @@
 import abc
 import inspect
-from typing import TYPE_CHECKING, Any, Dict, Iterator, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type, Union
 
 import numpy as np
 
 from pandas._libs import reduction as libreduction
+from pandas._typing import Axis
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.common import (
@@ -26,9 +27,9 @@ def frame_apply(
     obj: "DataFrame",
     func,
-    axis=0,
+    axis: Axis = 0,
     raw: bool = False,
-    result_type=None,
+    result_type: Optional[str] = None,
     ignore_failures: bool = False,
     args=None,
     kwds=None,
@@ -87,7 +88,7 @@ def __init__(
         obj: "DataFrame",
         func,
         raw: bool,
-        result_type,
+        result_type: Optional[str],
         ignore_failures: bool,
         args,
         kwds,
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 203ef3ec75c8f..f947a1fda49f1 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -334,7 +334,7 @@ def array(
     return result
 
 
-def extract_array(obj, extract_numpy=False):
+def extract_array(obj, extract_numpy: bool = False):
     """
     Extract the ndarray or ExtensionArray from a Series or Index.
 

From e4a96bbfcf038c0db349fbb6bf1db6aa2911f12e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Thu, 16 Jan 2020 13:15:38 -0800
Subject: [PATCH 073/158] ENH: Add engine keyword to expanding.apply to utilize Numba (#30937)

---
 asv_bench/asv.conf.json                       |  1 +
 asv_bench/benchmarks/rolling.py               | 21 ++++++++++++++
 doc/source/user_guide/computation.rst         |  1 +
 doc/source/whatsnew/v1.0.0.rst                | 12 ++++----
 pandas/core/window/expanding.py               | 20 +++++++++++--
 pandas/core/window/rolling.py                 |  2 +-
 .../window/moments/test_moments_expanding.py  | 29 +++++++++++++------
 7 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index cd1a31d4eaf34..7886b63e9983e 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -43,6 +43,7 @@
         "matplotlib": [],
         "sqlalchemy": [],
         "scipy": [],
+        "numba": [],
         "numexpr": [],
         "pytables": [null, ""], // platform dependent, see excludes below
         "tables": [null, ""],
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index 7a72622fd5fe3..f7e1e395a76bc 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -44,6 +44,27 @@ def time_rolling(self, constructor, window, dtype, function, raw):
         self.roll.apply(function, raw=raw)
 
 
+class Engine:
+    params = (
+        ["DataFrame", "Series"],
+        ["int", "float"],
+        [np.sum, lambda x: np.sum(x) + 5],
+        ["cython", "numba"],
+    )
+    param_names = ["constructor", "dtype", "function", "engine"]
+
+    def setup(self, constructor, dtype, function, engine):
+        N = 10 ** 3
+        arr = (100 * np.random.random(N)).astype(dtype)
+        self.data = getattr(pd, constructor)(arr)
+
+    def time_rolling_apply(self, constructor, dtype, function, engine):
+        self.data.rolling(10).apply(function, raw=True, engine=engine)
+
+    def time_expanding_apply(self, constructor, dtype, function, engine):
+        self.data.expanding().apply(function, raw=True, engine=engine)
+
+
 class ExpandingMethods:
 
     params = (
diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst
index a2150c207c0b0..aeb32db639ffb 100644
--- a/doc/source/user_guide/computation.rst
+++ b/doc/source/user_guide/computation.rst
@@ -348,6 +348,7 @@ Numba will be applied in potentially two routines:
 
 1. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function.
 ``func`` can also be a JITed function in which case the engine will not JIT the function again.
+
 2. The engine will JIT the for loop where the apply function is applied to each window.
 
 The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index c423933d4c438..fa562838c8f7c 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -159,14 +159,14 @@ You can use the alias ``"boolean"`` as well.
 
 .. _whatsnew_100.numba_rolling_apply:
 
-Using Numba in ``rolling.apply``
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Using Numba in ``rolling.apply`` and ``expanding.apply``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` that allows the user to execute the
-routine using `Numba `__ instead of Cython. Using the Numba engine
-can yield significant performance gains if the apply function can operate on numpy arrays and
+We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` and :meth:`~core.window.expanding.Expanding.apply`
+that allows the user to execute the routine using `Numba `__ instead of Cython.
+Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and
 the data set is larger (1 million rows or greater). For more details, see
-:ref:`rolling apply documentation ` (:issue:`28987`)
+:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`)
 
 .. _whatsnew_100.custom_window:
 
diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
index 68c3514308cbc..a0bf3376d2352 100644
--- a/pandas/core/window/expanding.py
+++ b/pandas/core/window/expanding.py
@@ -1,4 +1,5 @@
 from textwrap import dedent
+from typing import Dict, Optional
 
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import Appender, Substitution
@@ -148,8 +149,23 @@ def count(self, **kwargs):
 
     @Substitution(name="expanding")
     @Appender(_shared_docs["apply"])
-    def apply(self, func, raw=False, args=(), kwargs={}):
-        return super().apply(func, raw=raw, args=args, kwargs=kwargs)
+    def apply(
+        self,
+        func,
+        raw: bool = False,
+        engine: str = "cython",
+        engine_kwargs: Optional[Dict[str, bool]] = None,
+        args=None,
+        kwargs=None,
+    ):
+        return super().apply(
+            func,
+            raw=raw,
+            engine=engine,
+            engine_kwargs=engine_kwargs,
+            args=args,
+            kwargs=kwargs,
+        )
 
     @Substitution(name="expanding")
     @Appender(_shared_docs["sum"])
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index bdc94c7402eb5..f7efa69778c44 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -1203,7 +1203,7 @@ def count(self):
 
     _shared_docs["apply"] = dedent(
         r"""
-    The %(name)s function's apply function.
+    Apply an arbitrary function to each %(name)s window.
 
    Parameters
    ----------
diff --git a/pandas/tests/window/moments/test_moments_expanding.py b/pandas/tests/window/moments/test_moments_expanding.py
index 4596552d8f255..322082187f531 100644
--- a/pandas/tests/window/moments/test_moments_expanding.py
+++ b/pandas/tests/window/moments/test_moments_expanding.py
@@ -13,15 +13,17 @@ class TestExpandingMomentsConsistency(ConsistencyBase):
     def setup_method(self, method):
         self._create_data()
 
-    def test_expanding_apply_args_kwargs(self, raw):
+    def test_expanding_apply_args_kwargs(self, engine_and_raw):
         def mean_w_arg(x, const):
             return np.mean(x) + const
 
+        engine, raw = engine_and_raw
+
         df = DataFrame(np.random.rand(20, 3))
 
-        expected = df.expanding().apply(np.mean, raw=raw) + 20.0
+        expected = df.expanding().apply(np.mean, engine=engine, raw=raw) + 20.0
 
-        result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,))
+        result = df.expanding().apply(mean_w_arg, engine=engine, raw=raw, args=(20,))
         tm.assert_frame_equal(result, expected)
 
         result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20})
@@ -190,11 +192,14 @@ def expanding_func(x, min_periods=1, center=False, axis=0):
         )
 
     @pytest.mark.parametrize("has_min_periods", [True, False])
-    def test_expanding_apply(self, raw, has_min_periods):
+    def test_expanding_apply(self, engine_and_raw, has_min_periods):
+
+        engine, raw = engine_and_raw
+
         def expanding_mean(x, min_periods=1):
             exp = x.expanding(min_periods=min_periods)
-            result = exp.apply(lambda x: x.mean(), raw=raw)
+            result = exp.apply(lambda x: x.mean(), raw=raw, engine=engine)
             return result
 
         # TODO(jreback), needed to add preserve_nan=False
         self._check_expanding(expanding_mean, np.mean, preserve_nan=False)
         self._check_expanding_has_min_periods(expanding_mean, np.mean, has_min_periods)
 
-    def test_expanding_apply_empty_series(self, raw):
+    def test_expanding_apply_empty_series(self, engine_and_raw):
+        engine, raw = engine_and_raw
         ser = Series([], dtype=np.float64)
-        tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw))
+        tm.assert_series_equal(
+            ser, ser.expanding().apply(lambda x: x.mean(), raw=raw, engine=engine)
+        )
 
-    def test_expanding_apply_min_periods_0(self, raw):
+    def test_expanding_apply_min_periods_0(self, engine_and_raw):
         # GH 8080
+        engine, raw = engine_and_raw
         s = Series([None, None, None])
-        result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw)
+        result = s.expanding(min_periods=0).apply(
+            lambda x: len(x), raw=raw, engine=engine
+        )
         expected = Series([1.0, 2.0, 3.0])
         tm.assert_series_equal(result, expected)

From 321bae66ff7a8c60feccc087cf07cd6d985278da Mon Sep 17 00:00:00 2001
From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com>
Date: Thu, 16 Jan 2020 23:36:53 +0200
Subject: [PATCH 074/158] automatic 'end' year of copyright (#31085)

---
 doc/source/conf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index c6786a03f0e44..7f24d02a496e1 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -10,6 +10,7 @@
 # All configuration values have a default; values that are commented out
 # serve to show the default.
 
+from datetime import datetime
 import importlib
 import inspect
 import logging
@@ -137,7 +138,7 @@
 # General information about the project.
project = "pandas" -copyright = "2008-2020, the pandas development team" +copyright = f"2008-{datetime.now().year}, the pandas development team" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the From b2c90cc871caab5c96f6bb97049661d28dc8092d Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 16 Jan 2020 21:46:28 +0000 Subject: [PATCH 075/158] CLN: Remove download_wheels.py, moved to pandas-release (#31083) --- scripts/download_wheels.py | 47 -------------------------------------- 1 file changed, 47 deletions(-) delete mode 100644 scripts/download_wheels.py diff --git a/scripts/download_wheels.py b/scripts/download_wheels.py deleted file mode 100644 index 3d36eed2d888a..0000000000000 --- a/scripts/download_wheels.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python -"""Fetch wheels from wheels.scipy.org for a pandas version.""" -import argparse -import pathlib -import sys -import urllib.parse -import urllib.request - -from lxml import html - - -def parse_args(args=None): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("version", type=str, help="Pandas version (0.23.0)") - return parser.parse_args(args) - - -def fetch(version): - base = "http://wheels.scipy.org" - tree = html.parse(base) - root = tree.getroot() - - dest = pathlib.Path("dist") - dest.mkdir(exist_ok=True) - - files = [ - x - for x in root.xpath("//a/text()") - if x.startswith(f"pandas-{version}") and not dest.joinpath(x).exists() - ] - - N = len(files) - - for i, filename in enumerate(files, 1): - out = str(dest.joinpath(filename)) - link = urllib.request.urljoin(base, filename) - urllib.request.urlretrieve(link, out) - print(f"Downloaded {link} to {out} [{i}/{N}]") - - -def main(args=None): - args = parse_args(args) - fetch(args.version) - - -if __name__ == "__main__": - sys.exit(main()) From 14ab82eba54a7369698020aae6487ea96f7ac20e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Jan 2020 15:50:40 -0800 Subject: [PATCH 076/158] CLN: remove unused legacy pickle compat code (#31078) --- pandas/core/generic.py | 4 ++-- pandas/core/internals/managers.py | 25 +------------------------ pandas/core/series.py | 29 ----------------------------- 3 files changed, 3 insertions(+), 55 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b73f129fbda8e..6332ff45c59d0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1928,9 +1928,9 @@ def __setstate__(self, state): object.__setattr__(self, k, v) else: - self._unpickle_series_compat(state) + raise NotImplementedError("Pre-0.12 pickles are no longer supported") elif len(state) == 2: - self._unpickle_series_compat(state) + raise NotImplementedError("Pre-0.12 pickles are no longer supported") self._item_cache = {} diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1fce2594062d5..24cc551ad0e45 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -279,30 +279,7 @@ def unpickle_block(values, mgr_locs): unpickle_block(b["values"], b["mgr_locs"]) for b in state["blocks"] ) else: - # discard anything after 3rd, support beta pickling format for a - # little while longer - ax_arrays, bvalues, bitems = state[:3] - - self.axes = [ensure_index(ax) for ax in ax_arrays] - - if len(bitems) == 1 and self.axes[0].equals(bitems[0]): - # This is a workaround for pre-0.14.1 pickles that didn't - # support unpickling multi-block frames/panels with 
non-unique - # columns/items, because given a manager with items ["a", "b", - # "a"] there's no way of knowing which block's "a" is where. - # - # Single-block case can be supported under the assumption that - # block items corresponded to manager items 1-to-1. - all_mgr_locs = [slice(0, len(bitems[0]))] - else: - all_mgr_locs = [ - self.axes[0].get_indexer(blk_items) for blk_items in bitems - ] - - self.blocks = tuple( - unpickle_block(values, mgr_locs) - for values, mgr_locs in zip(bvalues, all_mgr_locs) - ) + raise NotImplementedError("pre-0.14.1 pickles are no longer supported") self._post_setstate() diff --git a/pandas/core/series.py b/pandas/core/series.py index 01b68550391e6..22b347c39fc54 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -744,35 +744,6 @@ def __array__(self, dtype=None) -> np.ndarray: # ---------------------------------------------------------------------- - def _unpickle_series_compat(self, state) -> None: - if isinstance(state, dict): - self._data = state["_data"] - self.name = state["name"] - self.index = self._data.index - - elif isinstance(state, tuple): - - # < 0.12 series pickle - - nd_state, own_state = state - - # recreate the ndarray - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - - # backwards compat - index, name = own_state[0], None - if len(own_state) > 1: - name = own_state[1] - - # recreate - self._data = SingleBlockManager(data, index, fastpath=True) - self._index = index - self.name = name - - else: - raise Exception(f"cannot unpickle legacy formats -> [{state}]") - # indexers @property def axes(self) -> List[Index]: From f6747b9c3de9cec70d7918f6a18e5fce0ea81a2d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 16 Jan 2020 15:55:08 -0800 Subject: [PATCH 077/158] REF: stricter types for RangeIndex._simple_new (#31084) --- pandas/core/indexes/base.py | 6 +++++- pandas/core/indexes/range.py | 19 ++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 47daaa4958411..22a0097c6b95f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -57,6 +57,7 @@ ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, + ABCRangeIndex, ABCSeries, ABCTimedeltaIndex, ) @@ -3105,7 +3106,10 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): if not isinstance(target, Index) and len(target) == 0: attrs = self._get_attributes_dict() attrs.pop("freq", None) # don't preserve freq - values = self._data[:0] # appropriately-dtyped empty array + if isinstance(self, ABCRangeIndex): + values = range(0) + else: + values = self._data[:0] # appropriately-dtyped empty array target = self._simple_new(values, dtype=self.dtype, **attrs) else: target = ensure_index(target) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 5c79942efb908..1629396796b85 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -27,12 +27,14 @@ import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name +from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name from pandas.core.indexes.numeric import Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.io.formats.printing import pprint_thing +_empty_range = range(0) + class RangeIndex(Int64Index): """ @@ -130,15 
+132,10 @@ def from_range(cls, data, name=None, dtype=None): return cls._simple_new(data, dtype=dtype, name=name) @classmethod - def _simple_new(cls, values, name=None, dtype=None): + def _simple_new(cls, values: range, name=None, dtype=None) -> "RangeIndex": result = object.__new__(cls) - # handle passed None, non-integers - if values is None: - # empty - values = range(0, 0, 1) - elif not isinstance(values, range): - return Index(values, dtype=dtype, name=name) + assert isinstance(values, range) result._range = values result.name = name @@ -482,7 +479,7 @@ def intersection(self, other, sort=False): return super().intersection(other, sort=sort) if not len(self) or not len(other): - return self._simple_new(None) + return self._simple_new(_empty_range) first = self._range[::-1] if self.step < 0 else self._range second = other._range[::-1] if other.step < 0 else other._range @@ -492,7 +489,7 @@ def intersection(self, other, sort=False): int_low = max(first.start, second.start) int_high = min(first.stop, second.stop) if int_high <= int_low: - return self._simple_new(None) + return self._simple_new(_empty_range) # Method hint: linear Diophantine equation # solve intersection problem @@ -502,7 +499,7 @@ def intersection(self, other, sort=False): # check whether element sets intersect if (first.start - second.start) % gcd: - return self._simple_new(None) + return self._simple_new(_empty_range) # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds From b68a9bba9240a84c2aec40cc5669a1077a40e609 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 17 Jan 2020 00:25:00 +0000 Subject: [PATCH 078/158] WEB: Improving the sponsors in the web, and adding CZI (#31046) --- web/pandas/about/sponsors.md | 39 +++++++++++++++++++------- web/pandas/config.yml | 30 ++++++++++++++++---- web/pandas/index.html | 23 +++++++++------ web/pandas/static/img/partners/czi.svg | 38 +++++++++++++++++++++++++ web/pandas_web.py | 0 5 files changed, 106 insertions(+), 24 deletions(-) create mode 100644 web/pandas/static/img/partners/czi.svg mode change 100644 => 100755 web/pandas_web.py diff --git a/web/pandas/about/sponsors.md b/web/pandas/about/sponsors.md index dcc6e367e5d64..4473a16cfd590 100644 --- a/web/pandas/about/sponsors.md +++ b/web/pandas/about/sponsors.md @@ -11,31 +11,50 @@ health and sustainability of the project. Visit numfocus.org for more informatio Donations to _pandas_ are managed by NumFOCUS. For donors in the United States, your gift is tax-deductible to the extent provided by law. As with any donation, you should consult with your tax adviser about your particular tax situation. -## Tidelift +## Become a sponsor -_pandas_ is part of the [Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-pandas?utm_source=pypi-pandas&utm_medium=referral&utm_campaign=readme). -You can support pandas by becoming a Tidelift subscriber. +As a free and open source project, _pandas_ relies on the support of the community of users for its development. +If you work for an organization that uses and benefits from _pandas_, please consider supporting pandas. There +are different ways, such as employing people to work on pandas, funding the project, or becoming a +[NumFOCUS sponsor](https://numfocus.org/sponsors) to support the broader ecosystem. Please contact us at +[admin@numfocus.org](mailto:admin@numfocus.org) to discuss. ## Institutional partners -Institutional Partners are companies and universities that support the project by employing contributors. 
-Current Institutional Partners include:
+Institutional partners are companies and universities that support the project by employing contributors.
+Current institutional partners include:
 
 <ul>
-    {% for company in partners.active if company.employs %}
-        <li><a href="{{ company.url }}">{{ company.name }}</a> ({{ company.employs }})</li>
+    {% for company in sponsors.active if company.kind == "partner" %}
+        <li><a href="{{ company.url }}">{{ company.name }}</a>: {{ company.description }}</li>
     {% endfor %}
 </ul>
+
+## Sponsors
+
+Sponsors are organizations that provide funding for pandas. Current sponsors include:
+
+<ul>
+    {% for company in sponsors.active if company.kind == "regular" %}
+        <li><a href="{{ company.url }}">{{ company.name }}</a>: {{ company.description }}</li>
+    {% endfor %}
+</ul>
 
 ## In-kind sponsors
 
-- [OVH](https://us.ovhcloud.com/): Hosting
-- [Indeed](https://opensource.indeedeng.io/): Logo and website design
+In-kind sponsors are organizations that support pandas development with goods or services.
+Current in-kind sponsors include:
+
+<ul>
+    {% for company in sponsors.inkind %}
+        <li><a href="{{ company.url }}">{{ company.name }}</a>: {{ company.description }}</li>
+    {% endfor %}
+</ul>
 
 ## Past institutional partners
 
 <ul>
-    {% for company in partners.past %}
+    {% for company in sponsors.past if company.kind == "partner" %}
        <li><a href="{{ company.url }}">{{ company.name }}</a></li>
    {% endfor %}
</ul>
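These template loops are driven by the `sponsors` mapping that the next diff introduces in `web/pandas/config.yml`. As a rough sketch of how the `if company.kind == "partner"` filtering behaves, the following renders an inline template against trimmed-down data; the inline template and the two-entry context are illustrative stand-ins, not the actual `pandas_web.py` build:

```python
# Minimal sketch (not the real site build): render a "partner"-filtered loop
# against data shaped like the sponsors mapping in web/pandas/config.yml.
import jinja2

context = {
    "sponsors": {
        "active": [
            {"name": "Anaconda", "url": "https://www.anaconda.com/",
             "kind": "partner", "description": "Tom Augspurger, Brock Mendel"},
            {"name": "Tidelift", "url": "https://tidelift.com",
             "kind": "regular", "description": "pandas is part of the Tidelift subscription."},
        ]
    }
}

template = jinja2.Template(
    '{% for company in sponsors.active if company.kind == "partner" %}'
    "{{ company.name }}: {{ company.description }}\n"
    "{% endfor %}"
)
# Only Anaconda is emitted; Tidelift has kind == "regular" and is filtered out.
print(template.render(**context))
```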
diff --git a/web/pandas/config.yml b/web/pandas/config.yml index d1fb7ba0f7b86..d041d6dd2ac95 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -99,30 +99,50 @@ maintainers: - Wes McKinney - Jeff Reback - Joris Van den Bossche -partners: +sponsors: active: - name: "NumFOCUS" url: https://numfocus.org/ logo: /static/img/partners/numfocus.svg + kind: numfocus - name: "Anaconda" url: https://www.anaconda.com/ logo: /static/img/partners/anaconda.svg - employs: "Tom Augspurger, Brock Mendel" + kind: partner + description: "Tom Augspurger, Brock Mendel" - name: "Two Sigma" url: https://www.twosigma.com/ logo: /static/img/partners/two_sigma.svg - employs: "Phillip Cloud, Jeff Reback" + kind: partner + description: "Phillip Cloud, Jeff Reback" - name: "RStudio" url: https://www.rstudio.com/ logo: /static/img/partners/r_studio.svg - employs: "Wes McKinney" + kind: partner + description: "Wes McKinney" - name: "Ursa Labs" url: https://ursalabs.org/ logo: /static/img/partners/ursa_labs.svg - employs: "Wes McKinney, Joris Van den Bossche" + kind: partner + description: "Wes McKinney, Joris Van den Bossche" - name: "Tidelift" url: https://tidelift.com logo: /static/img/partners/tidelift.svg + kind: regular + description: "pandas is part of the Tidelift subscription. You can support pandas by becoming a Tidelift subscriber." + - name: "Chan Zuckerberg Initiative" + url: https://chanzuckerberg.com/ + logo: /static/img/partners/czi.svg + kind: regular + description: "pandas is funded by the Essential Open Source Software for Science program of the Chan Zuckerberg Initiative. The funding is used for general maintenance, improving extension types, and an efficient string type." + inkind: # not included in active so they don't appear in the home page + - name: "OVH" + url: https://us.ovhcloud.com/ + description: "Website and documentation hosting." + - name: "Indeed" + url: https://opensource.indeedeng.io/ + description: "pandas logo design." past: - name: "Paris-Saclay Center for Data Science" url: https://www.datascience-paris-saclay.fr/ + kind: partner diff --git a/web/pandas/index.html b/web/pandas/index.html index 5aac5da16295b..0f4598add4efc 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -43,15 +43,20 @@
    Community
    With the support of:
-        <div class="row">
-          {% for company in partners.active %}
-            <div class="col-md-2 col-sm-4 col-6">
-              <a href="{{ company.url }}">
-                <img class="img-fluid" alt="{{ company.name }}" src="{{ base_url }}{{ company.logo }}"/>
-              </a>
-            </div>
-          {% endfor %}
-        </div>
+        {% for row in sponsors.active | batch(6, "") %}
+          <div class="row">
+            {% for company in row %}
+              <div class="col-md-2 col-sm-4 col-6">
+                {% if company %}
+                  <a href="{{ company.url }}">
+                    <img class="img-fluid" alt="{{ company.name }}" src="{{ base_url }}{{ company.logo }}"/>
+                  </a>
+                {% endif %}
+              </div>
+            {% endfor %}
+          </div>
+        {% endfor %}

+        <p>The full list of companies supporting pandas is available in the sponsors page.</p>
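The `batch(6, "")` filter in the hunk above splits the sponsor list into rows of six, padding the last row with empty strings; the `{% if company %}` guard then skips the padding cells. A minimal, self-contained illustration of that behaviour (the item list is invented; the semantics are Jinja's built-in `batch` filter):

```python
# Illustration of Jinja's batch filter as used in the index.html diff above.
from jinja2 import Template

t = Template('{% for row in items | batch(3, "") %}{{ row }}\n{% endfor %}')
print(t.render(items=["a", "b", "c", "d", "e"]))
# ['a', 'b', 'c']
# ['d', 'e', '']   <- padded cell, which is why the template guards with {% if company %}
```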

diff --git a/web/pandas/static/img/partners/czi.svg b/web/pandas/static/img/partners/czi.svg new file mode 100644 index 0000000000000..b0ad9eb80580b --- /dev/null +++ b/web/pandas/static/img/partners/czi.svg @@ -0,0 +1,38 @@ +[38 lines of SVG markup for the CZI logo ("Group", "Created with Sketch.")] \ No newline at end of file diff --git a/web/pandas_web.py b/web/pandas_web.py old mode 100644 new mode 100755 From 0f04c6af5eea56f69d046eb1603c25f4b8d45919 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Fri, 17 Jan 2020 04:24:18 +0200 Subject: [PATCH 079/158] CI/TST: fix failing tests in py37_np_dev (#31091) --- pandas/tests/arrays/categorical/test_algos.py | 2 +- pandas/tests/indexes/multi/test_analytics.py | 2 +- pandas/tests/indexes/period/test_indexing.py | 2 +- pandas/tests/series/indexing/test_indexing.py | 2 +- pandas/tests/test_take.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 52640044565fc..07abdceb71f9f 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -111,7 +111,7 @@ def test_take_bounds(self, allow_fill): if allow_fill: msg = "indices are out-of-bounds" else: - msg = "index 4 is out of bounds for size 3" + msg = "index 4 is out of bounds for( axis 0 with|) size 3" with pytest.raises(IndexError, match=msg): cat.take([4, 5], allow_fill=allow_fill) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 209cc627aba8b..5d441b0fa8091 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -218,7 +218,7 @@ def test_take_fill_value(): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - msg = "index -5 is out of bounds for size 4" + msg = "index -5 is out of bounds for( axis 0 with|) size 4" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 592dccc5fc8ed..900b1cc91d905 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -357,7 +357,7 @@ def test_take_fill_value(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - msg = "index -5 is out of bounds for size 3" + msg = "index -5 is out of bounds for( axis 0 with|) size 3" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index d552ce739d91c..4bc5bb90f2cb7 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -894,7 +894,7 @@ def test_take(): expected = Series([4, 2, 4], index=[4, 3, 4]) tm.assert_series_equal(actual, expected) - msg = "index {} is out of bounds for size 5" + msg = "index {} is out of bounds for( axis 0 with|) size 5" with pytest.raises(IndexError, match=msg.format(10)): s.take([1, 10]) with pytest.raises(IndexError, match=msg.format(5)): diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index 1cd5f11057464..d7c6496e0ae5b 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -428,7 +428,7 @@ def test_bounds_check_large(self): with
pytest.raises(IndexError, match=msg): algos.take(arr, [2, 3], allow_fill=True) - msg = "index 2 is out of bounds for size 2" + msg = "index 2 is out of bounds for( axis 0 with|) size 2" with pytest.raises(IndexError, match=msg): algos.take(arr, [2, 3], allow_fill=False) From d21638b738e46884d2d42d3203e09d9e452f8c28 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Fri, 17 Jan 2020 12:13:54 +0200 Subject: [PATCH 080/158] CLN: Regular expression clean as @jbrockmendel suggested (#31098) --- pandas/tests/arrays/categorical/test_algos.py | 2 +- pandas/tests/indexes/multi/test_analytics.py | 2 +- pandas/tests/indexes/period/test_indexing.py | 2 +- pandas/tests/series/indexing/test_indexing.py | 2 +- pandas/tests/test_take.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 07abdceb71f9f..5ff0bb8ef0d78 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -111,7 +111,7 @@ def test_take_bounds(self, allow_fill): if allow_fill: msg = "indices are out-of-bounds" else: - msg = "index 4 is out of bounds for( axis 0 with|) size 3" + msg = "index 4 is out of bounds for( axis 0 with)? size 3" with pytest.raises(IndexError, match=msg): cat.take([4, 5], allow_fill=allow_fill) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 5d441b0fa8091..2db61d4f4b852 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -218,7 +218,7 @@ def test_take_fill_value(): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - msg = "index -5 is out of bounds for( axis 0 with|) size 4" + msg = "index -5 is out of bounds for( axis 0 with)? size 4" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 900b1cc91d905..1e3160980e8bb 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -357,7 +357,7 @@ def test_take_fill_value(self): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - msg = "index -5 is out of bounds for( axis 0 with|) size 3" + msg = "index -5 is out of bounds for( axis 0 with)? size 3" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 4bc5bb90f2cb7..65731cf45bd2d 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -894,7 +894,7 @@ def test_take(): expected = Series([4, 2, 4], index=[4, 3, 4]) tm.assert_series_equal(actual, expected) - msg = "index {} is out of bounds for( axis 0 with|) size 5" + msg = "index {} is out of bounds for( axis 0 with)? 
size 5" with pytest.raises(IndexError, match=msg.format(10)): s.take([1, 10]) with pytest.raises(IndexError, match=msg.format(5)): diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index d7c6496e0ae5b..1d2ab9358c01c 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -428,7 +428,7 @@ def test_bounds_check_large(self): with pytest.raises(IndexError, match=msg): algos.take(arr, [2, 3], allow_fill=True) - msg = "index 2 is out of bounds for( axis 0 with|) size 2" + msg = "index 2 is out of bounds for( axis 0 with)? size 2" with pytest.raises(IndexError, match=msg): algos.take(arr, [2, 3], allow_fill=False) From a72eef5a86979232337149961d2122f0b555c876 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Fri, 17 Jan 2020 20:27:37 +0700 Subject: [PATCH 081/158] DOC: Add missing docstrings in pd.Index (#31047) --- pandas/core/indexes/base.py | 282 +++++++++++++++++++++++++++++++++++- 1 file changed, 281 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 22a0097c6b95f..3cec698cec64d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1164,6 +1164,9 @@ def to_frame(self, index=True, name=None): @property def name(self): + """ + Return Index or MultiIndex name. + """ return self._name @name.setter @@ -1645,21 +1648,230 @@ def is_unique(self) -> bool: @property def has_duplicates(self) -> bool: + """ + Check if the Index has duplicate values. + + Returns + ------- + bool + Whether or not the Index has duplicate values. + + Examples + -------- + >>> idx = pd.Index([1, 5, 7, 7]) + >>> idx.has_duplicates + True + + >>> idx = pd.Index([1, 5, 7]) + >>> idx.has_duplicates + False + + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.has_duplicates + True + + >>> idx = pd.Index(["Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.has_duplicates + False + """ return not self.is_unique def is_boolean(self) -> bool: + """ + Check if the Index only consists of booleans. + + Returns + ------- + bool + Whether or not the Index only consists of booleans. + + See Also + -------- + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index([True, False, True]) + >>> idx.is_boolean() + True + + >>> idx = pd.Index(["True", "False", "True"]) + >>> idx.is_boolean() + False + + >>> idx = pd.Index([True, False, "True"]) + >>> idx.is_boolean() + False + """ return self.inferred_type in ["boolean"] def is_integer(self) -> bool: + """ + Check if the Index only consists of integers. + + Returns + ------- + bool + Whether or not the Index only consists of integers. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. 
+ + Examples + -------- + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx.is_integer() + True + + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_integer() + False + + >>> idx = pd.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_integer() + False + """ return self.inferred_type in ["integer"] def is_floating(self) -> bool: + """ + Check if the Index is a floating type. + + The Index may consist of only floats, NaNs, or a mix of floats, + integers, or NaNs. + + Returns + ------- + bool + Whether or not the Index only consists of floats, NaNs, or + a mix of floats, integers, or NaNs. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_floating() + True + + >>> idx = pd.Index([1.0, 2.0, np.nan, 4.0]) + >>> idx.is_floating() + True + + >>> idx = pd.Index([1, 2, 3, 4, np.nan]) + >>> idx.is_floating() + True + + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx.is_floating() + False + """ return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] def is_numeric(self) -> bool: + """ + Check if the Index only consists of numeric data. + + Returns + ------- + bool + Whether or not the Index only consists of numeric data. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_numeric() + True + + >>> idx = pd.Index([1, 2, 3, 4.0]) + >>> idx.is_numeric() + True + + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx.is_numeric() + True + + >>> idx = pd.Index([1, 2, 3, 4.0, np.nan]) + >>> idx.is_numeric() + True + + >>> idx = pd.Index([1, 2, 3, 4.0, np.nan, "Apple"]) + >>> idx.is_numeric() + False + """ return self.inferred_type in ["integer", "floating"] def is_object(self) -> bool: + """ + Check if the Index is of the object dtype. + + Returns + ------- + bool + Whether or not the Index is of the object dtype. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_object() + True + + >>> idx = pd.Index(["Apple", "Mango", 2.0]) + >>> idx.is_object() + True + + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ...
"Watermelon"]).astype("category") + >>> idx.object() + False + + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_object() + False + """ return is_object_dtype(self.dtype) def is_categorical(self) -> bool: @@ -1668,12 +1880,19 @@ def is_categorical(self) -> bool: Returns ------- - boolean + bool True if the Index is categorical. See Also -------- CategoricalIndex : Index for categorical data. + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. Examples -------- @@ -1699,9 +1918,67 @@ def is_categorical(self) -> bool: return self.inferred_type in ["categorical"] def is_interval(self) -> bool: + """ + Check if the Index holds Interval objects. + + Returns + ------- + bool + Whether or not the Index holds Interval objects. + + See Also + -------- + IntervalIndex : Index for Interval objects. + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index([pd.Interval(left=0, right=5), + ... pd.Interval(left=5, right=10)]) + >>> idx.is_interval() + True + + >>> idx = pd.Index([1, 3, 5, 7]) + >>> idx.is_interval() + False + """ return self.inferred_type in ["interval"] def is_mixed(self) -> bool: + """ + Check if the Index holds data with mixed data types. + + Returns + ------- + bool + Whether or not the Index holds data with mixed data types. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> idx = pd.Index(['a', np.nan, 'b']) + >>> idx.is_mixed() + True + + >>> idx = pd.Index([1.0, 2.0, 3.0, 5.0]) + >>> idx.is_mixed() + False + """ return self.inferred_type in ["mixed"] def holds_integer(self): @@ -1719,6 +1996,9 @@ def inferred_type(self): @cache_readonly def is_all_dates(self) -> bool: + """ + Whether or not the index values only consist of dates. 
+ """ return is_datetime_array(ensure_object(self.values)) # -------------------------------------------------------------------- From 47f4e61f27ab28a2f2bbece5f3597272214bfcb6 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 17 Jan 2020 23:53:34 +0800 Subject: [PATCH 082/158] ENH: handles min_periods argument in rolling.count (GH26996) --- pandas/core/window/rolling.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 43dd9a911603b..c79394a79974b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1185,8 +1185,14 @@ def count(self): window = self._get_window() window = min(window, len(obj)) if not self.center else window - min_periods = self.min_periods if self.min_periods is not None else 0 - min_periods = min(min_periods, len(obj)) if not self.center else min_periods + + # We set the default value min_periods to be 0 because count method + # is meant to count NAs, we don't want it by default requires all + # values in the window to be valid to produce a valid count + min_periods = 0 if self.min_periods is None else self.min_periods + + # this is required as window is mutate above + min_periods = min(min_periods, window) results = [] for b in blocks: From fb0f96ef7a4f20848717cf330077ec955ce7025a Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 17 Jan 2020 16:51:20 +0000 Subject: [PATCH 083/158] WEB: Styling blog (#31094) --- web/pandas/community/blog.html | 8 ++++---- web/pandas_web.py | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/web/pandas/community/blog.html b/web/pandas/community/blog.html index ffe6f97d679e4..627aaa450893b 100644 --- a/web/pandas/community/blog.html +++ b/web/pandas/community/blog.html @@ -4,10 +4,10 @@ {% for post in blog.posts %}
    -

    {{ post.title }}

    -
    Source: {{ post.feed }} | Author: {{ post.author }} | Published: {{ post.published.strftime("%b %d, %Y") }}
    -
    {{ post.summary }}
    - Read +
    {{ post.title }}
    +
    Source: {{ post.feed }} | Author: {{ post.author }} | Published: {{ post.published.strftime("%b %d, %Y") }}
    +
    {{ post.summary }}
    + Read more
    {% endfor %} diff --git a/web/pandas_web.py b/web/pandas_web.py index d515d8a0e1cd7..45dafcf0c4c10 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -28,14 +28,15 @@ import importlib import operator import os +import re import shutil import sys import time import typing import feedparser -import markdown import jinja2 +import markdown import requests import yaml @@ -74,6 +75,7 @@ def blog_add_posts(context): preprocessor fetches the posts in the feeds, and returns the relevant information for them (sorted from newest to oldest). """ + tag_expr = re.compile("<.*?>") posts = [] for feed_url in context["blog"]["feed"]: feed_data = feedparser.parse(feed_url) @@ -81,6 +83,7 @@ def blog_add_posts(context): published = datetime.datetime.fromtimestamp( time.mktime(entry.published_parsed) ) + summary = re.sub(tag_expr, "", entry.summary) posts.append( { "title": entry.title, @@ -89,7 +92,7 @@ def blog_add_posts(context): "feed": feed_data["feed"]["title"], "link": entry.link, "description": entry.description, - "summary": entry.summary, + "summary": summary, } ) posts.sort(key=operator.itemgetter("published"), reverse=True) From a0c7a8ab5b544db598d01f083dd2edaf8cd3afe8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 17 Jan 2020 18:48:43 +0100 Subject: [PATCH 084/158] CLN: Index._values docstring + Block.internal/external_values (#31103) --- pandas/core/arrays/sparse/scipy_sparse.py | 8 ++++---- pandas/core/indexes/base.py | 13 +++++++------ pandas/core/indexes/interval.py | 4 ---- pandas/core/internals/blocks.py | 20 +++++++++++++------- pandas/core/internals/managers.py | 2 ++ 5 files changed, 26 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 88d63071c360f..17a953fce9ec0 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -17,14 +17,14 @@ def _check_is_partition(parts, whole): def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): - """ For arbitrary (MultiIndexed) SparseSeries return + """ For arbitrary (MultiIndexed) sparse Series return (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo constructor. """ # index and column levels must be a partition of the index _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) - # from the SparseSeries: get the labels and data for non-null entries - values = ss._data.internal_values()._valid_sp_values + # from the sparse Series: get the labels and data for non-null entries + values = ss.array._valid_sp_values nonnull_labels = ss.dropna() @@ -85,7 +85,7 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ - Convert a SparseSeries to a scipy.sparse.coo_matrix using index + Convert a sparse Series to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column labels respectively. Returns the sparse_matrix, row and column labels. """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3cec698cec64d..bb893bd2ffef6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3925,15 +3925,16 @@ def values(self): @property def _values(self) -> Union[ExtensionArray, ABCIndexClass, np.ndarray]: - # TODO(EA): remove index types as they become extension arrays """ The best array representation. 
- This is an ndarray, ExtensionArray, or Index subclass. This differs - from ``_ndarray_values``, which always returns an ndarray. + This is an ndarray or ExtensionArray. This differs from + ``_ndarray_values``, which always returns an ndarray. Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index``. + ``Series`` and ``Index`` (except for datetime64[ns], which returns + a DatetimeArray for _values on the Index, but ndarray[M8ns] on the + Series). It may differ from the public '.values' method. @@ -3941,8 +3942,8 @@ def _values(self) -> Union[ExtensionArray, ABCIndexClass, np.ndarray]: ----------------- | --------------- | ------------- | --------------- | Index | ndarray | ndarray | ndarray | CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + DatetimeIndex | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | + DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 8da6907750ac7..4df1d25c7ff0c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -405,10 +405,6 @@ def values(self): """ return self._data - @cache_readonly - def _values(self): - return self._data - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5fe5290fa65f1..cb702a81d2bde 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -192,13 +192,19 @@ def is_categorical_astype(self, dtype): return False - def external_values(self, dtype=None): - """ return an outside world format, currently just the ndarray """ + def external_values(self): + """ + The array that Series.values returns (public attribute). + This has some historical constraints, and is overridden in block + subclasses to return the correct array (e.g. period returns + object ndarray and datetimetz a datetime64[ns] ndarray instead of + proper extension array). + """ return self.values - def internal_values(self, dtype=None): - """ return an internal format, currently just the ndarray - this should be the pure internal API format + def internal_values(self): + """ + The array that Series._values returns (internal values). """ return self.values @@ -1966,7 +1972,7 @@ class ObjectValuesExtensionBlock(ExtensionBlock): Series[T].values is an ndarray of objects. 
""" - def external_values(self, dtype=None): + def external_values(self): return self.values.astype(object) @@ -2482,7 +2488,7 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): ) return rvalues - def external_values(self, dtype=None): + def external_values(self): return np.asarray(self.values.astype("timedelta64[ns]", copy=False)) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 24cc551ad0e45..39e0aa078638f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1531,9 +1531,11 @@ def get_dtypes(self): return np.array([self._block.dtype]) def external_values(self): + """The array that Series.values returns""" return self._block.external_values() def internal_values(self): + """The array that Series._values returns""" return self._block.internal_values() def get_values(self): From 641346cd27f4788f0c628170d0178eab6f404c81 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sat, 18 Jan 2020 01:29:04 +0200 Subject: [PATCH 085/158] TST: Fix some bare pytest raises (#31105) --- pandas/tests/internals/test_internals.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 15b1434f8629f..9c1442b75fbb2 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1200,7 +1200,7 @@ def test_binop_other(self, op, value, dtype): (operator.pow, "bool"), } if (op, dtype) in skip: - pytest.skip("Invalid combination {},{}".format(op, dtype)) + pytest.skip(f"Invalid combination {op},{dtype}") e = DummyElement(value, dtype) s = pd.DataFrame({"A": [e.value, e.value]}, dtype=e.dtype) @@ -1216,7 +1216,17 @@ def test_binop_other(self, op, value, dtype): } if (op, dtype) in invalid: - with pytest.raises(TypeError): + msg = ( + None + if (dtype == " Date: Sat, 18 Jan 2020 00:57:03 +0100 Subject: [PATCH 086/158] BUG: pd.crosstab(s1, s2) handle column index incorrectly when both series have tuple names (#30978) --- doc/source/whatsnew/v1.1.0.rst | 2 ++ pandas/core/reshape/pivot.py | 10 +++++++++- pandas/tests/reshape/test_pivot.py | 13 +++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b5a7b19f160a4..8133e54c934ad 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -142,6 +142,8 @@ Reshaping - - Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`) +- Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. 
(:issue:`18321`) + Sparse ^^^^^^ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 7109f23761188..13df39cc0011b 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -581,6 +581,8 @@ def crosstab( from pandas import DataFrame df = DataFrame(data, index=common_idx) + original_df_cols = df.columns + if values is None: df["__dummy__"] = 0 kwargs = {"aggfunc": len, "fill_value": 0} @@ -589,7 +591,7 @@ def crosstab( kwargs = {"aggfunc": aggfunc} table = df.pivot_table( - "__dummy__", + ["__dummy__"], index=rownames, columns=colnames, margins=margins, @@ -598,6 +600,12 @@ def crosstab( **kwargs, ) + # GH18321, after pivoting, an extra top level of column index of `__dummy__` is + # created, and this extra level should not be included in the further steps + if not table.empty: + cols_diff = df.columns.difference(original_df_cols)[0] + table = table[cols_diff] + # Post-process if normalize is not False: table = _normalize( diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e3a57da450334..a2e6a19996668 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2557,6 +2557,19 @@ def test_crosstab_tuple_name(self, names): result = pd.crosstab(s1, s2) tm.assert_frame_equal(result, expected) + def test_crosstab_both_tuple_names(self): + # GH 18321 + s1 = pd.Series(range(3), name=("a", "b")) + s2 = pd.Series(range(3), name=("c", "d")) + + expected = pd.DataFrame( + np.eye(3, dtype="int64"), + index=pd.Index(range(3), name=("a", "b")), + columns=pd.Index(range(3), name=("c", "d")), + ) + result = crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + def test_crosstab_unsorted_order(self): df = pd.DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) result = pd.crosstab(df.index, [df.b, df.a]) From f873fb9150726dde4deb6b9ed5f186b57f4834d0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 03:54:21 -0800 Subject: [PATCH 087/158] TYP: annotations (#31115) --- pandas/core/indexes/base.py | 6 +++--- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/interval.py | 6 +++--- pandas/core/indexes/multi.py | 6 +++--- pandas/core/indexes/numeric.py | 2 +- pandas/core/indexes/range.py | 4 ++-- pandas/tseries/frequencies.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bb893bd2ffef6..bfa560ccae068 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4566,7 +4566,7 @@ def shift(self, periods=1, freq=None): """ raise NotImplementedError(f"Not supported for type {type(self).__name__}") - def argsort(self, *args, **kwargs): + def argsort(self, *args, **kwargs) -> np.ndarray: """ Return the integer indices that would sort the index. @@ -5052,7 +5052,7 @@ def _searchsorted_monotonic(self, label, side="left"): raise ValueError("index must be monotonic increasing or decreasing") - def get_slice_bound(self, label, side, kind): + def get_slice_bound(self, label, side, kind) -> int: """ Calculate slice bound that corresponds to given label. @@ -5217,7 +5217,7 @@ def delete(self, loc): """ return self._shallow_copy(np.delete(self._data, loc)) - def insert(self, loc, item): + def insert(self, loc: int, item): """ Make new Index inserting new item at location. 
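The return-type annotations added in this patch codify behaviour that already holds rather than change it; `Index.argsort`, for example, already hands back a plain NumPy integer array rather than an `Index`. A quick illustrative check (output repr is from pandas of this vintage):

```python
# Quick check of the behaviour the `-> np.ndarray` annotation documents.
import numpy as np
import pandas as pd

idx = pd.Index([3, 1, 2])
order = idx.argsort()
assert isinstance(order, np.ndarray)  # a numpy array, not an Index
print(idx[order])                     # Int64Index([1, 2, 3], dtype='int64')
```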
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index a247a986fcb55..a7afa78190d90 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -798,7 +798,7 @@ def delete(self, loc): """ return self._create_from_codes(np.delete(self.codes, loc)) - def insert(self, loc, item): + def insert(self, loc: int, item): """ Make new Index inserting new item at location. Follows Python list.append semantics for negative values diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 4df1d25c7ff0c..1b851ca38459a 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -468,7 +468,7 @@ def is_unique(self): return True @property - def is_overlapping(self): + def is_overlapping(self) -> bool: """ Return True if the IntervalIndex has overlapping intervals, else False. @@ -562,7 +562,7 @@ def _can_reindex(self, indexer: np.ndarray) -> None: if self.is_overlapping and len(indexer): raise ValueError("cannot reindex from an overlapping axis") - def _needs_i8_conversion(self, key): + def _needs_i8_conversion(self, key) -> bool: """ Check if a given key needs i8 conversion. Conversion is necessary for Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An @@ -1036,7 +1036,7 @@ def _format_space(self) -> str: # -------------------------------------------------------------------- - def argsort(self, *args, **kwargs): + def argsort(self, *args, **kwargs) -> np.ndarray: return np.lexsort((self.right, self.left)) def equals(self, other) -> bool: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 21421a6f6ea62..10a2d9f68a7b6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2013,7 +2013,7 @@ def append(self, other): except (TypeError, IndexError): return Index(new_tuples) - def argsort(self, *args, **kwargs): + def argsort(self, *args, **kwargs) -> np.ndarray: return self.values.argsort(*args, **kwargs) @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) @@ -3135,7 +3135,7 @@ def equals(self, other) -> bool: return True - def equal_levels(self, other): + def equal_levels(self, other) -> bool: """ Return True if the levels of both MultiIndex objects are the same @@ -3335,7 +3335,7 @@ def _convert_can_do_setop(self, other): result_names = self.names if self.names == other.names else None return other, result_names - def insert(self, loc, item): + def insert(self, loc: int, item): """ Make new MultiIndex inserting new item at location diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 53f96ace890fb..2638784f7b50d 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -160,7 +160,7 @@ def is_all_dates(self) -> bool: return False @Appender(Index.insert.__doc__) - def insert(self, loc, item): + def insert(self, loc: int, item): # treat NA values as nans: if is_scalar(item) and isna(item): item = self._na_value diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 1629396796b85..67eb5c26fc83a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -421,7 +421,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): nv.validate_max(args, kwargs) return self._minmax("max") - def argsort(self, *args, **kwargs): + def argsort(self, *args, **kwargs) -> np.ndarray: """ Returns the indices that would sort the index and its underlying data. 
@@ -441,7 +441,7 @@ def argsort(self, *args, **kwargs): else: return np.arange(len(self) - 1, -1, -1) - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. """ diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index e2d007cd2d7f8..af34180fb3170 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -334,7 +334,7 @@ def is_unique(self) -> bool: return len(self.deltas) == 1 @cache_readonly - def is_unique_asi8(self): + def is_unique_asi8(self) -> bool: return len(self.deltas_asi8) == 1 def get_freq(self) -> Optional[str]: From 9c33464768071184a6e26dcae79f75b1abf840a0 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 18 Jan 2020 07:34:12 -0800 Subject: [PATCH 088/158] JSON Date Handling 1.0 Regressions (#30977) --- pandas/_libs/src/ujson/python/objToJSON.c | 122 ++++++++++++++-------- pandas/tests/io/json/test_pandas.py | 26 +++++ 2 files changed, 102 insertions(+), 46 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index c413a16f8d5f0..c5ac279ed3243 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -456,8 +456,8 @@ static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, size_t *len) { - if (!PyDateTime_Check(obj)) { - PyErr_SetString(PyExc_TypeError, "Expected datetime object"); + if (!PyDate_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected date object"); return NULL; } @@ -469,7 +469,7 @@ static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) { npy_datetimestruct dts; int ret; - if (!PyDateTime_Check(obj)) { + if (!PyDate_Check(obj)) { // TODO: raise TypeError } PyDateTime_Date *dt = (PyDateTime_Date *)obj; @@ -1504,6 +1504,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, char **ret; char *dataptr, *cLabel; int type_num; + NPY_DATETIMEUNIT base = enc->datetimeUnit; PRINTMARK(); if (!labels) { @@ -1541,32 +1542,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - // TODO: vectorized timedelta solution - if (enc->datetimeIso && - (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { - PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); - if (td == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); - Py_DECREF(td); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); - len = strlen(cLabel); - } else if (PyTypeNum_ISDATETIME(type_num)) { - NPY_DATETIMEUNIT base = enc->datetimeUnit; - npy_int64 longVal; + int is_datetimelike = 0; + npy_int64 nanosecVal; + if (PyTypeNum_ISDATETIME(type_num)) { + is_datetimelike = 1; PyArray_VectorUnaryFunc *castfunc = PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); if (!castfunc) { @@ -1574,27 +1553,74 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, "Cannot cast numpy dtype %d to long", enc->npyType); } - castfunc(dataptr, &longVal, 1, NULL, NULL); - if (enc->datetimeIso) { - cLabel = int64ToIso(longVal, base, &len); + castfunc(dataptr, &nanosecVal, 1, NULL, NULL); + } else if (PyDate_Check(item) || PyDelta_Check(item)) { + is_datetimelike = 1; + if (PyObject_HasAttrString(item, "value")) { 
+ nanosecVal = get_long_attr(item, "value"); } else { - if (!scaleNanosecToUnit(&longVal, base)) { - // TODO: This gets hit but somehow doesn't cause errors - // need to clean up (elsewhere in module as well) + if (PyDelta_Check(item)) { + nanosecVal = total_seconds(item) * + 1000000000LL; // nanoseconds per second + } else { + // datetime.* objects don't follow above rules + nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); } - cLabel = PyObject_Malloc(21); // 21 chars for int64 - sprintf(cLabel, "%" NPY_INT64_FMT, longVal); - len = strlen(cLabel); } - } else if (PyDateTime_Check(item) || PyDate_Check(item)) { - NPY_DATETIMEUNIT base = enc->datetimeUnit; - if (enc->datetimeIso) { - cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len); + } + + if (is_datetimelike) { + if (nanosecVal == get_nat()) { + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); } else { - cLabel = PyObject_Malloc(21); // 21 chars for int64 - sprintf(cLabel, "%" NPY_DATETIME_FMT, - PyDateTimeToEpoch(item, base)); - len = strlen(cLabel); + if (enc->datetimeIso) { + // TODO: Vectorized Timedelta function + if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { + PyObject *td = + PyObject_CallFunction(cls_timedelta, "(O)", item); + if (td == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *iso = + PyObject_CallMethod(td, "isoformat", NULL); + Py_DECREF(td); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + len = strlen(PyUnicode_AsUTF8(iso)); + cLabel = PyObject_Malloc(len + 1); + memcpy(cLabel, PyUnicode_AsUTF8(iso), len + 1); + Py_DECREF(iso); + } else { + if (type_num == NPY_DATETIME) { + cLabel = int64ToIso(nanosecVal, base, &len); + } else { + cLabel = PyDateTimeToIso((PyDateTime_Date *)item, + base, &len); + } + } + if (cLabel == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + } else { + cLabel = PyObject_Malloc(21); // 21 chars for int64 + sprintf(cLabel, "%" NPY_DATETIME_FMT, + NpyDateTimeToEpoch(nanosecVal, base)); + len = strlen(cLabel); + } } } else { // Fallback to string representation PyObject *str = PyObject_Str(item); @@ -1615,6 +1641,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, ret[i] = PyObject_Malloc(len + 1); memcpy(ret[i], cLabel, len + 1); + if (is_datetimelike) { + PyObject_Free(cLabel); + } + if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); ret = 0; diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index e909a4952948c..bb873c71e8a35 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,4 +1,5 @@ from collections import OrderedDict +import datetime from datetime import timedelta from io import StringIO import json @@ -810,6 +811,31 @@ def test_convert_dates(self): result = read_json(json, typ="series") tm.assert_series_equal(result, ts) + @pytest.mark.parametrize("date_format", ["epoch", "iso"]) + @pytest.mark.parametrize("as_object", [True, False]) + @pytest.mark.parametrize( + "date_typ", [datetime.date, datetime.datetime, pd.Timestamp] + ) + def test_date_index_and_values(self, date_format, as_object, date_typ): + data = [date_typ(year=2020, month=1, day=1), pd.NaT] + if as_object: + data.append("a") + + ser = pd.Series(data, index=data) + result = ser.to_json(date_format=date_format) + + if date_format == "epoch": + expected = 
'{"1577836800000":1577836800000,"null":null}' + else: + expected = ( + '{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z","null":null}' + ) + + if as_object: + expected = expected.replace("}", ',"a":"a"}') + + assert result == expected + @pytest.mark.parametrize( "infer_word", [ From 4e9ee4df1d26399941089d84cfeebe7c34519993 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 18 Jan 2020 16:35:39 +0100 Subject: [PATCH 089/158] BUG: reductions for nullable dtypes should return pd.NA for skipna=False (#30971) --- doc/source/whatsnew/v1.0.0.rst | 19 +++++++++++++++++++ pandas/core/arrays/boolean.py | 8 +++++--- pandas/core/arrays/integer.py | 14 ++++++++------ pandas/tests/extension/test_boolean.py | 4 +++- pandas/tests/extension/test_integer.py | 10 +++++++++- 5 files changed, 44 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index fa562838c8f7c..3bd86bb02155f 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -483,6 +483,25 @@ Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead. a.to_numpy(dtype="float", na_value=np.nan) +**Reductions can return ``pd.NA``** + +When performing a reduction such as a sum with ``skipna=False``, the result +will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values +(:issue:`30958`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.Series(a).sum(skipna=False) + nan + +*pandas 1.0.0* + +.. ipython:: python + + pd.Series(a).sum(skipna=False) + **value_counts returns a nullable integer dtype** :meth:`Series.value_counts` with a nullable integer dtype now returns a nullable diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index fa1cbc87cc5c1..eaa17df1235d3 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -670,13 +670,15 @@ def _reduce(self, name, skipna=True, **kwargs): mask = self._mask # coerce to a nan-aware float if needed - if mask.any(): - data = self._data.astype("float64") - data[mask] = np.nan + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + if np.isnan(result): + return libmissing.NA + # if we have numeric op that would result in an int, coerce to int if possible if name in ["sum", "prod"] and notna(result): int_result = np.int64(result) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index cb1e7115cd3c2..67036761bc62a 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -21,7 +21,7 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops from pandas.core.ops import invalid_comparison @@ -549,21 +549,23 @@ def _reduce(self, name, skipna=True, **kwargs): mask = self._mask # coerce to a nan-aware float if needed - if mask.any(): - data = self._data.astype("float64") - # We explicitly use NaN within reductions. 
- data[mask] = np.nan + # (we explicitly use NaN within reductions) + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + if np.isnan(result): + return libmissing.NA + # if we have a boolean op, don't coerce if name in ["any", "all"]: pass # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ["sum", "min", "max", "prod"] and notna(result): + elif name in ["sum", "min", "max", "prod"]: int_result = int(result) if int_result == result: result = int_result diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index a7ce0fb097599..c489445d8512a 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -327,7 +327,9 @@ def check_reduce(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) # override parent function to cast to bool for min/max - if op_name in ("min", "max") and not pd.isna(expected): + if np.isnan(expected): + expected = pd.NA + elif op_name in ("min", "max"): expected = bool(expected) tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index afb8412f12ea9..f55ec75b47dfa 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import is_extension_array_dtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import integer_array from pandas.core.arrays.integer import ( Int8Dtype, @@ -233,7 +234,14 @@ class TestGroupby(base.BaseGroupbyTests): class TestNumericReduce(base.BaseNumericReduceTests): - pass + def check_reduce(self, s, op_name, skipna): + # overwrite to ensure pd.NA is tested instead of np.nan + # https://github.com/pandas-dev/pandas/issues/30958 + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + if np.isnan(expected): + expected = pd.NA + tm.assert_almost_equal(result, expected) class TestBooleanReduce(base.BaseBooleanReduceTests): From f19035d3a9848f61f00f753c1b7aac334930d425 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 07:49:03 -0800 Subject: [PATCH 090/158] REF: fix calls to Index.get_value (#31112) --- pandas/core/indexes/base.py | 41 +++++++++++++++------------------ pandas/core/indexes/category.py | 2 +- pandas/core/indexes/numeric.py | 14 +++++++---- pandas/core/series.py | 8 ++++--- 4 files changed, 35 insertions(+), 30 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bfa560ccae068..e7f306592fa8a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4621,34 +4621,31 @@ def argsort(self, *args, **kwargs) -> np.ndarray: @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) def get_value(self, series, key): + if not is_scalar(key): + # if key is not a scalar, directly raise an error (the code below + # would convert to numpy arrays and raise later any way) - GH29926 + raise InvalidIndexError(key) + # if we have something that is Index-like, then # use this, e.g. DatetimeIndex # Things like `Series._get_value` (via .at) pass the EA directly here. 
s = extract_array(series, extract_numpy=True) if isinstance(s, ExtensionArray): - if is_scalar(key): - # GH 20882, 21257 - # First try to convert the key to a location - # If that fails, raise a KeyError if an integer - # index, otherwise, see if key is an integer, and - # try that - try: - iloc = self.get_loc(key) - return s[iloc] - except KeyError: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): - raise - elif is_integer(key): - return s[key] - else: - # if key is not a scalar, directly raise an error (the code below - # would convert to numpy arrays and raise later any way) - GH29926 - raise InvalidIndexError(key) - - s = com.values_from_object(series) - k = com.values_from_object(key) + # GH 20882, 21257 + # First try to convert the key to a location + # If that fails, raise a KeyError if an integer + # index, otherwise, see if key is an integer, and + # try that + try: + iloc = self.get_loc(key) + return s[iloc] + except KeyError: + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + raise + elif is_integer(key): + return s[key] - k = self._convert_scalar_indexer(k, kind="getitem") + k = self._convert_scalar_indexer(key, kind="getitem") try: return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None)) except KeyError as e1: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index a7afa78190d90..0ff6469d6b19c 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -503,8 +503,8 @@ def get_value(self, series: AnyArrayLike, key: Any): Any The element of the series at the position indicated by the key """ + k = key try: - k = com.values_from_object(key) k = self._convert_scalar_indexer(k, kind="getitem") indexer = self.get_loc(k) return series.take([indexer])[0] diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 2638784f7b50d..c5fca8652fed4 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import numpy as np from pandas._libs import index as libindex, lib @@ -38,6 +40,9 @@ ) from pandas.core.ops import get_op_result_name +if TYPE_CHECKING: + from pandas import Series + _num_index_shared_docs = dict() @@ -438,17 +443,18 @@ def _format_native_types( ) return formatter.get_result_as_array() - def get_value(self, series, key): + def get_value(self, series: "Series", key): """ We always want to get an index value, never a value. 
""" if not is_scalar(key): raise InvalidIndexError - k = com.values_from_object(key) - loc = self.get_loc(k) - new_values = com.values_from_object(series)[loc] + loc = self.get_loc(key) + if not is_scalar(loc): + return series.iloc[loc] + new_values = series._values[loc] return new_values def equals(self, other) -> bool: diff --git a/pandas/core/series.py b/pandas/core/series.py index 22b347c39fc54..ec9475c6dcba9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -808,6 +808,10 @@ def _slice(self, slobj: slice, axis: int = 0, kind=None) -> "Series": def __getitem__(self, key): key = com.apply_if_callable(key, self) + + if key is Ellipsis: + return self + try: result = self.index.get_value(self, key) @@ -830,8 +834,6 @@ def __getitem__(self, key): if isinstance(key, tuple) and isinstance(self.index, MultiIndex): # kludge pass - elif key is Ellipsis: - return self elif com.is_bool_indexer(key): pass else: @@ -939,7 +941,7 @@ def _get_value(self, label, takeable: bool = False): """ if takeable: return com.maybe_box_datetimelike(self._values[label]) - return self.index.get_value(self._values, label) + return self.index.get_value(self, label) def __setitem__(self, key, value): key = com.apply_if_callable(key, self) From 1c9f23cc540961b78b3c9e6dbef5698a3b626b65 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 07:51:05 -0800 Subject: [PATCH 091/158] CLN: prune unreachable code (#31106) --- pandas/core/base.py | 4 +--- pandas/core/frame.py | 7 ------- pandas/core/generic.py | 21 +++------------------ pandas/core/indexes/base.py | 2 +- pandas/core/indexing.py | 7 ++----- pandas/core/internals/managers.py | 13 ++++--------- 6 files changed, 11 insertions(+), 43 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 66d7cd59dcfa4..c6800d282700f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -583,12 +583,10 @@ def _is_builtin_func(self, arg): class ShallowMixin: _attributes: List[str] = [] - def _shallow_copy(self, obj=None, **kwargs): + def _shallow_copy(self, obj, **kwargs): """ return a new object with the replacement attributes """ - if obj is None: - obj = self._selected_obj.copy() if isinstance(obj, self._constructor): obj = obj.obj diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6dd3a415297db..1a49388d81243 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2749,14 +2749,7 @@ def _ixs(self, i: int, axis: int = 0): else: label = self.columns[i] - # if the values returned are not the same length - # as the index (iow a not found value), iget returns - # a 0-len ndarray. This is effectively catching - # a numpy error (as numpy should really raise) values = self._data.iget(i) - - if len(self.index) and not len(values): - values = np.array([np.nan] * len(self.index), dtype=object) result = self._box_col_values(values, label) # this is a cached value, mark it so diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6332ff45c59d0..0c5c119468994 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3173,19 +3173,6 @@ def to_csv( return None - # ---------------------------------------------------------------------- - # Fancy Indexing - - @classmethod - def _create_indexer(cls, name: str, indexer) -> None: - """Create an indexer like _name in the class. - - Kept for compatibility with geopandas. To be removed in the future. 
See GH27258 - """ - if getattr(cls, name, None) is None: - _indexer = functools.partial(indexer, name) - setattr(cls, name, property(_indexer, doc=indexer.__doc__)) - # ---------------------------------------------------------------------- # Lookup Caching @@ -3579,14 +3566,12 @@ def _set_item(self, key, value) -> None: self._data.set(key, value) self._clear_item_cache() - def _set_is_copy(self, ref=None, copy: bool_t = True) -> None: + def _set_is_copy(self, ref, copy: bool_t = True) -> None: if not copy: self._is_copy = None else: - if ref is not None: - self._is_copy = weakref.ref(ref) - else: - self._is_copy = None + assert ref is not None + self._is_copy = weakref.ref(ref) def _check_is_chained_assignment_possible(self) -> bool_t: """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e7f306592fa8a..08629d9a61707 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3129,7 +3129,7 @@ def _convert_scalar_indexer(self, key, kind=None): if kind in ["getitem"] and is_float(key): if not self.is_floating(): - return self._invalid_indexer("label", key) + self._invalid_indexer("label", key) elif kind in ["loc"] and is_float(key): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 04503e5d98c10..63a86792082da 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -619,9 +619,8 @@ def _get_setitem_indexer(self, key): if isinstance(key, range): return list(key) - axis = self.axis or 0 try: - return self._convert_to_indexer(key, axis=axis) + return self._convert_to_indexer(key, axis=0) except TypeError as e: # invalid indexer type vs 'other' indexing errors @@ -1472,9 +1471,7 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): else: keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) - self._validate_read_indexer( - keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing - ) + self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing) return keyarr, indexer def _getitem_iterable(self, key, axis: int): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 39e0aa078638f..847f543ebca4d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -766,16 +766,14 @@ def copy_func(ax): res.axes = new_axes return res - def as_array(self, transpose=False, items=None): - """Convert the blockmanager data into an numpy array. + def as_array(self, transpose: bool = False) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. Parameters ---------- transpose : boolean, default False If True, transpose the return array - items : list of strings or None - Names of block items that will be included in the returned - array. 
``None`` means that all block items will be used Returns ------- @@ -785,10 +783,7 @@ def as_array(self, transpose=False, items=None): arr = np.empty(self.shape, dtype=float) return arr.transpose() if transpose else arr - if items is not None: - mgr = self.reindex_axis(items, axis=0) - else: - mgr = self + mgr = self if self._is_single_block and mgr.blocks[0].is_datetimetz: # TODO(Block.get_values): Make DatetimeTZBlock.get_values From df2a3e926b1dc7f1642d67e7d1bddb5af5482c32 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 18 Jan 2020 23:52:20 +0800 Subject: [PATCH 092/158] DOC: moved whatsnew to V1.1.0 --- doc/source/whatsnew/v1.0.0.rst | 1 - doc/source/whatsnew/v1.1.0.rst | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ed508b609fc09..fa562838c8f7c 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1112,7 +1112,6 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) - Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`) -- Bug in :meth:`Rolling.count` and :meth:`Expanding.count` argument ``min_periods`` ignored (:issue:`26996`) Reshaping ^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b5a7b19f160a4..e18a6331751e5 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -133,9 +133,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- -- - +- Bug in :meth:`Rolling.count` and :meth:`Expanding.count` argument ``min_periods`` ignored (:issue:`26996`) Reshaping ^^^^^^^^^ From 65b23c2213aaef2f303fa4410d9a48e6f2c83b15 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 07:56:36 -0800 Subject: [PATCH 093/158] BUG: partial-timestamp slicing near the end of year/quarter/month (#31064) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/indexes/datetimes.py | 27 ++++++++++++------- .../indexes/datetimes/test_partial_slicing.py | 20 ++++++++++++++ 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8133e54c934ad..b5106a1b1a7a0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -102,7 +102,7 @@ Interval Indexing ^^^^^^^^ - +- Bug in slicing on a :class:`DatetimeIndex` with a partial-timestamp dropping high-resolution indices near the end of a year, quarter, or month (:issue:`31064`) - - diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 942b51eda7d0b..ad93748493390 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -5,7 +5,14 @@ import numpy as np -from pandas._libs import NaT, Timestamp, index as libindex, lib, tslib as libts +from pandas._libs import ( + NaT, + Timedelta, + Timestamp, + index as libindex, + lib, + tslib as libts, +) from pandas._libs.tslibs import ccalendar, fields, parsing, timezones from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -31,7 +38,7 @@ import pandas.core.tools.datetimes as tools from pandas.tseries.frequencies import Resolution, to_offset -from pandas.tseries.offsets import Nano, prefix_mapping +from pandas.tseries.offsets import prefix_mapping def 
_new_DatetimeIndex(cls, d): @@ -519,27 +526,27 @@ def _parsed_string_to_bounds(self, reso, parsed): raise KeyError if reso == "year": start = Timestamp(parsed.year, 1, 1) - end = Timestamp(parsed.year, 12, 31, 23, 59, 59, 999999) + end = Timestamp(parsed.year + 1, 1, 1) - Timedelta(nanoseconds=1) elif reso == "month": d = ccalendar.get_days_in_month(parsed.year, parsed.month) start = Timestamp(parsed.year, parsed.month, 1) - end = Timestamp(parsed.year, parsed.month, d, 23, 59, 59, 999999) + end = start + Timedelta(days=d, nanoseconds=-1) elif reso == "quarter": qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month start = Timestamp(parsed.year, parsed.month, 1) - end = Timestamp(parsed.year, qe, d, 23, 59, 59, 999999) + end = Timestamp(parsed.year, qe, 1) + Timedelta(days=d, nanoseconds=-1) elif reso == "day": start = Timestamp(parsed.year, parsed.month, parsed.day) - end = start + timedelta(days=1) - Nano(1) + end = start + Timedelta(days=1, nanoseconds=-1) elif reso == "hour": start = Timestamp(parsed.year, parsed.month, parsed.day, parsed.hour) - end = start + timedelta(hours=1) - Nano(1) + end = start + Timedelta(hours=1, nanoseconds=-1) elif reso == "minute": start = Timestamp( parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute ) - end = start + timedelta(minutes=1) - Nano(1) + end = start + Timedelta(minutes=1, nanoseconds=-1) elif reso == "second": start = Timestamp( parsed.year, @@ -549,7 +556,7 @@ def _parsed_string_to_bounds(self, reso, parsed): parsed.minute, parsed.second, ) - end = start + timedelta(seconds=1) - Nano(1) + end = start + Timedelta(seconds=1, nanoseconds=-1) elif reso == "microsecond": start = Timestamp( parsed.year, @@ -560,7 +567,7 @@ def _parsed_string_to_bounds(self, reso, parsed): parsed.second, parsed.microsecond, ) - end = start + timedelta(microseconds=1) - Nano(1) + end = start + Timedelta(microseconds=1, nanoseconds=-1) # GH 24076 # If an incoming date string contained a UTC offset, need to localize # the parsed date to this offset first before aligning with the index's diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index e30cc4449e01e..946d658e90132 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -142,6 +142,26 @@ def test_slice_year(self): expected = slice(3288, 3653) assert result == expected + @pytest.mark.parametrize( + "partial_dtime", + [ + "2019", + "2019Q4", + "Dec 2019", + "2019-12-31", + "2019-12-31 23", + "2019-12-31 23:59", + ], + ) + def test_slice_end_of_period_resolution(self, partial_dtime): + # GH#31064 + dti = date_range("2019-12-31 23:59:55.999999999", periods=10, freq="s") + + ser = pd.Series(range(10), index=dti) + result = ser[partial_dtime] + expected = ser.iloc[:5] + tm.assert_series_equal(result, expected) + def test_slice_quarter(self): dti = date_range(freq="D", start=datetime(2000, 6, 1), periods=500) From 8b754fcd097816eefeb9072e3a882aaf694c518a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 07:59:27 -0800 Subject: [PATCH 094/158] CLN: update _simple_new usages (#31089) --- pandas/core/indexes/datetimes.py | 4 ++-- pandas/core/indexes/timedeltas.py | 4 ++-- pandas/core/tools/datetimes.py | 10 ++++++---- pandas/io/pytables.py | 8 +++++--- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexes/datetimes.py 
b/pandas/core/indexes/datetimes.py index ad93748493390..8dc9ff869578e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -260,7 +260,7 @@ def __new__( ambiguous=ambiguous, ) - subarr = cls._simple_new(dtarr, name=name, freq=dtarr.freq, tz=dtarr.tz) + subarr = cls._simple_new(dtarr, name=name) return subarr @classmethod @@ -1170,7 +1170,7 @@ def date_range( closed=closed, **kwargs, ) - return DatetimeIndex._simple_new(dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) + return DatetimeIndex._simple_new(dtarr, name=name) def bdate_range( diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 582c257b50ad0..9dba87f67c41d 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -177,7 +177,7 @@ def __new__( tdarr = TimedeltaArray._from_sequence( data, freq=freq, unit=unit, dtype=dtype, copy=copy ) - return cls._simple_new(tdarr._data, freq=tdarr.freq, name=name) + return cls._simple_new(tdarr, name=name) @classmethod def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): @@ -507,4 +507,4 @@ def timedelta_range( freq, freq_infer = dtl.maybe_infer_freq(freq) tdarr = TimedeltaArray._generate_range(start, end, periods, freq, closed=closed) - return TimedeltaIndex._simple_new(tdarr._data, freq=tdarr.freq, name=name) + return TimedeltaIndex._simple_new(tdarr, name=name) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 84c17748c503c..3a9d0623ff4a6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -37,9 +37,10 @@ ) from pandas.core.dtypes.missing import notna -from pandas.arrays import IntegerArray +from pandas.arrays import DatetimeArray, IntegerArray from pandas.core import algorithms from pandas.core.algorithms import unique +from pandas.core.arrays.datetimes import tz_to_dtype # --------------------------------------------------------------------- # types used in annotations @@ -282,7 +283,6 @@ def _convert_listlike_datetimes( Index-like of parsed dates """ from pandas import DatetimeIndex - from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import ( maybe_convert_dtype, objects_to_datetime64ns, @@ -427,7 +427,8 @@ def _convert_listlike_datetimes( # datetime objects are found without passing `utc=True` try: values, tz = conversion.datetime_to_datetime64(arg) - return DatetimeIndex._simple_new(values, name=name, tz=tz) + dta = DatetimeArray(values, dtype=tz_to_dtype(tz)) + return DatetimeIndex._simple_new(dta, name=name) except (ValueError, TypeError): raise e @@ -447,7 +448,8 @@ def _convert_listlike_datetimes( if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC - return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed) + dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) + return DatetimeIndex._simple_new(dta, name=name) utc = tz == "utc" return _box_as_indexlike(result, utc=utc, name=name) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9e8d8a2e89f20..3e4673c890bef 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -58,7 +58,7 @@ concat, isna, ) -from pandas.core.arrays.categorical import Categorical +from pandas.core.arrays import Categorical, DatetimeArray, PeriodArray import pandas.core.common as com from pandas.core.computation.pytables import PyTablesExpr, maybe_expression from pandas.core.indexes.api import ensure_index @@ -2656,7 +2656,8 @@ def _get_index_factory(self, klass): def f(values, 
freq=None, tz=None): # data are already in UTC, localize and convert if tz present - result = DatetimeIndex._simple_new(values.values, name=None, freq=freq) + dta = DatetimeArray._simple_new(values.values, freq=freq) + result = DatetimeIndex._simple_new(dta, name=None) if tz is not None: result = result.tz_localize("UTC").tz_convert(tz) return result @@ -2665,7 +2666,8 @@ def f(values, freq=None, tz=None): elif klass == PeriodIndex: def f(values, freq=None, tz=None): - return PeriodIndex._simple_new(values, name=None, freq=freq) + parr = PeriodArray._simple_new(values, freq=freq) + return PeriodIndex._simple_new(parr, name=None) return f From 024925a4eeae5e650977922767be4755f6756a94 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 08:03:17 -0800 Subject: [PATCH 095/158] REF: handle searchsorted casting within DatetimeLikeArray (#30950) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/arrays/datetimelike.py | 27 +++++++++++++++++++---- pandas/core/indexes/datetimes.py | 17 +++----------- pandas/core/indexes/period.py | 12 ---------- pandas/core/indexes/timedeltas.py | 17 +++----------- pandas/tests/arrays/test_datetimes.py | 20 ++++++----------- pandas/tests/arrays/test_timedeltas.py | 20 ++++++----------- pandas/tests/indexes/period/test_tools.py | 7 +++++- 8 files changed, 50 insertions(+), 72 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b5106a1b1a7a0..82a3aa6e032b6 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -60,7 +60,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :class:`Timestamp` where constructing :class:`Timestamp` from ambiguous epoch time and calling constructor again changed :meth:`Timestamp.value` property (:issue:`24329`) -- +- :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`) - Timedelta diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index d7c508c890a46..70637026c278d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -743,17 +743,36 @@ def searchsorted(self, value, side="left", sorter=None): Array of insertion points with the same shape as `value`. 
""" if isinstance(value, str): - value = self._scalar_from_string(value) + try: + value = self._scalar_from_string(value) + except ValueError: + raise TypeError("searchsorted requires compatible dtype or scalar") + + elif is_valid_nat_for_dtype(value, self.dtype): + value = NaT + + elif isinstance(value, self._recognized_scalars): + value = self._scalar_type(value) + + elif isinstance(value, np.ndarray): + if not type(self)._is_recognized_dtype(value): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + value = type(self)(value) + self._check_compatible_with(value) - if not (isinstance(value, (self._scalar_type, type(self))) or isna(value)): - raise ValueError(f"Unexpected type for 'value': {type(value)}") + if not (isinstance(value, (self._scalar_type, type(self))) or (value is NaT)): + raise TypeError(f"Unexpected type for 'value': {type(value)}") - self._check_compatible_with(value) if isinstance(value, type(self)): + self._check_compatible_with(value) value = value.asi8 else: value = self._unbox_scalar(value) + # TODO: Use datetime64 semantics for sorting, xref GH#29844 return self.asi8.searchsorted(value, side=side, sorter=sorter) def repeat(self, repeats, *args, **kwargs): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 8dc9ff869578e..e8935950cd42d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -833,24 +833,13 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): @Substitution(klass="DatetimeIndex") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, (np.ndarray, Index)): - if not type(self._data)._is_recognized_dtype(value): - raise TypeError( - "searchsorted requires compatible dtype or scalar, " - f"not {type(value).__name__}" - ) - value = type(self._data)(value) - self._data._check_compatible_with(value) - - elif isinstance(value, self._data._recognized_scalars): - self._data._check_compatible_with(value) - value = self._data._scalar_type(value) - - elif not isinstance(value, DatetimeArray): + if isinstance(value, str): raise TypeError( "searchsorted requires compatible dtype or scalar, " f"not {type(value).__name__}" ) + if isinstance(value, Index): + value = value._data return self._data.searchsorted(value, side=side) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index a54d09e8bede0..bb7f6fb65adfc 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -470,18 +470,6 @@ def astype(self, dtype, copy=True, how="start"): @Substitution(klass="PeriodIndex") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, Period) or value is NaT: - self._data._check_compatible_with(value) - elif isinstance(value, str): - try: - value = Period(value, freq=self.freq) - except DateParseError: - raise KeyError(f"Cannot interpret '{value}' as period") - elif not isinstance(value, PeriodArray): - raise TypeError( - "PeriodIndex.searchsorted requires either a Period or PeriodArray" - ) - return self._data.searchsorted(value, side=side, sorter=sorter) @property diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 9dba87f67c41d..41ade4d2fc1d8 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -347,24 +347,13 @@ def _partial_td_slice(self, key): @Substitution(klass="TimedeltaIndex") 
@Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, (np.ndarray, Index)): - if not type(self._data)._is_recognized_dtype(value): - raise TypeError( - "searchsorted requires compatible dtype or scalar, " - f"not {type(value).__name__}" - ) - value = type(self._data)(value) - self._data._check_compatible_with(value) - - elif isinstance(value, self._data._recognized_scalars): - self._data._check_compatible_with(value) - value = self._data._scalar_type(value) - - elif not isinstance(value, TimedeltaArray): + if isinstance(value, str): raise TypeError( "searchsorted requires compatible dtype or scalar, " f"not {type(value).__name__}" ) + if isinstance(value, Index): + value = value._data return self._data.searchsorted(value, side=side, sorter=sorter) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 5608ab5fbd9db..a59ed429cc404 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -331,25 +331,19 @@ def test_searchsorted_tzawareness_compat(self, index): pd.Timestamp.now().to_period("D"), ], ) - @pytest.mark.parametrize( - "index", - [ - True, - pytest.param( - False, - marks=pytest.mark.xfail( - reason="Raises ValueError instead of TypeError", raises=ValueError - ), - ), - ], - ) + @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_invalid_types(self, other, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = DatetimeArray(data, freq="D") if index: arr = pd.Index(arr) - msg = "searchsorted requires compatible dtype or scalar" + msg = "|".join( + [ + "searchsorted requires compatible dtype or scalar", + "Unexpected type for 'value'", + ] + ) with pytest.raises(TypeError, match=msg): arr.searchsorted(other) diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 62cb4766171a4..c86b4f71ee592 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -154,25 +154,19 @@ def test_setitem_objects(self, obj): pd.Timestamp.now().to_period("D"), ], ) - @pytest.mark.parametrize( - "index", - [ - True, - pytest.param( - False, - marks=pytest.mark.xfail( - reason="Raises ValueError instead of TypeError", raises=ValueError - ), - ), - ], - ) + @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_invalid_types(self, other, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = TimedeltaArray(data, freq="D") if index: arr = pd.Index(arr) - msg = "searchsorted requires compatible dtype or scalar" + msg = "|".join( + [ + "searchsorted requires compatible dtype or scalar", + "Unexpected type for 'value'", + ] + ) with pytest.raises(TypeError, match=msg): arr.searchsorted(other) diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 28ab14af71362..23350fdff4b78 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -249,7 +249,12 @@ def test_searchsorted_invalid(self): other = np.array([0, 1], dtype=np.int64) - msg = "requires either a Period or PeriodArray" + msg = "|".join( + [ + "searchsorted requires compatible dtype or scalar", + "Unexpected type for 'value'", + ] + ) with pytest.raises(TypeError, match=msg): pidx.searchsorted(other) From 55cfabb630b2c205bd25fcaf999d85beaf7f7163 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 08:17:31 -0800 Subject: [PATCH 096/158] REF: be stricter 
about what we pass to _simple_new (#31055) --- pandas/core/arrays/period.py | 3 ++- pandas/core/indexes/base.py | 12 ++---------- pandas/core/indexes/datetimes.py | 4 ---- pandas/core/indexes/interval.py | 2 ++ pandas/core/indexes/numeric.py | 3 ++- pandas/core/indexes/period.py | 2 -- pandas/core/indexes/range.py | 2 +- pandas/core/indexes/timedeltas.py | 4 +++- 8 files changed, 12 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 1e2a02e988fdd..d9b53aa4a867c 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -169,8 +169,9 @@ def __init__(self, values, freq=None, dtype=None, copy=False): self._dtype = PeriodDtype(freq) @classmethod - def _simple_new(cls, values, freq=None, **kwargs): + def _simple_new(cls, values: np.ndarray, freq=None, **kwargs): # alias for PeriodArray.__init__ + assert isinstance(values, np.ndarray) and values.dtype == "i8" return cls(values, freq=freq, **kwargs) @classmethod diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 08629d9a61707..a630938afeb8a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -50,7 +50,6 @@ from pandas.core.dtypes.generic import ( ABCCategorical, ABCDataFrame, - ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, ABCIntervalIndex, @@ -460,11 +459,7 @@ def _simple_new(cls, values, name=None, dtype=None): Must be careful not to recurse. """ - if isinstance(values, (ABCSeries, ABCIndexClass)): - # Index._data must always be an ndarray. - # This is no-copy for when _values is an ndarray, - # which should be always at this point. - values = np.asarray(values._values) + assert isinstance(values, np.ndarray), type(values) result = object.__new__(cls) result._data = values @@ -510,6 +505,7 @@ def _get_attributes_dict(self): def _shallow_copy(self, values=None, **kwargs): if values is None: values = self.values + attributes = self._get_attributes_dict() attributes.update(kwargs) if not len(values) and "dtype" not in kwargs: @@ -517,10 +513,6 @@ def _shallow_copy(self, values=None, **kwargs): # _simple_new expects an the type of self._data values = getattr(values, "_values", values) - if isinstance(values, ABCDatetimeArray): - # `self.values` returns `self` for tz-aware, so we need to unwrap - # more specifically - values = values.asi8 return self._simple_new(values, **attributes) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e8935950cd42d..91b9aa63c6a8e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -281,10 +281,6 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): freq = values.freq values = values._data - # DatetimeArray._simple_new will accept either i8 or M8[ns] dtypes - if isinstance(values, DatetimeIndex): - values = values._data - dtype = tz_to_dtype(tz) dtarr = DatetimeArray._simple_new(values, freq=freq, dtype=dtype) assert isinstance(dtarr, DatetimeArray) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 1b851ca38459a..523d6404f5efa 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -259,6 +259,8 @@ def _simple_new(cls, array, name, closed=None): closed : Any Ignored. 
""" + assert isinstance(array, IntervalArray), type(array) + result = IntervalMixin.__new__(cls) result._data = array result.name = name diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index c5fca8652fed4..566341d78c7ed 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -57,6 +57,7 @@ class NumericIndex(Index): def __new__(cls, data=None, dtype=None, copy=False, name=None): cls._validate_dtype(dtype) + name = maybe_extract_name(name, data, cls) # Coerce to ndarray if not already ndarray or Index if not isinstance(data, (np.ndarray, Index)): @@ -82,7 +83,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None): # GH#13601, GH#20285, GH#27125 raise ValueError("Index data must be 1-dimensional") - name = maybe_extract_name(name, data, cls) + subarr = np.asarray(subarr) return cls._simple_new(subarr, name=name) @classmethod diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index bb7f6fb65adfc..35f96e61704f0 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -250,8 +250,6 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): freq = Period._maybe_convert_freq(freq) values = PeriodArray(values, freq=freq) - if not isinstance(values, PeriodArray): - raise TypeError("PeriodIndex._simple_new only accepts PeriodArray") result = object.__new__(cls) result._data = values # For groupby perf. See note in indexes/base about _index_data diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 67eb5c26fc83a..336f65ca574dc 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -114,7 +114,7 @@ def __new__( return cls._simple_new(rng, dtype=dtype, name=name) @classmethod - def from_range(cls, data, name=None, dtype=None): + def from_range(cls, data: range, name=None, dtype=None) -> "RangeIndex": """ Create RangeIndex from a range object. 
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 41ade4d2fc1d8..e7427438828a8 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -183,6 +183,7 @@ def __new__( def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): # `dtype` is passed by _shallow_copy in corner cases, should always # be timedelta64[ns] if present + if not isinstance(values, TimedeltaArray): values = TimedeltaArray._simple_new(values, dtype=dtype, freq=freq) else: @@ -409,7 +410,8 @@ def insert(self, loc, item): new_i8s = np.concatenate( (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) ) - return self._shallow_copy(new_i8s, freq=freq) + tda = type(self._data)._simple_new(new_i8s, freq=freq) + return self._shallow_copy(tda) except (AttributeError, TypeError): # fall back to object index From f792d8c50ee456aa8aa2ae406d8e6b8843f45614 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Sat, 18 Jan 2020 17:23:31 +0100 Subject: [PATCH 097/158] BUG: correct wrong error message in df.pivot when columns=None (#30925) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/reshape/pivot.py | 3 +++ pandas/tests/reshape/test_pivot.py | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 82a3aa6e032b6..1cd325dad9f07 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -142,6 +142,7 @@ Reshaping - - Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`) +- Fix incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. 
(:issue:`18321`) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 13df39cc0011b..930ff5f454a7b 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -429,6 +429,9 @@ def _convert_by(by): @Substitution("\ndata : DataFrame") @Appender(_shared_docs["pivot"], indents=1) def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame": + if columns is None: + raise TypeError("pivot() missing 1 required argument: 'columns'") + if values is None: cols = [columns] if index is None else [index, columns] append = index is None diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index a2e6a19996668..44073f56abfa1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -781,6 +781,15 @@ def test_pivot_with_list_like_values_nans(self, values, method): expected = DataFrame(data=data, index=index, columns=columns, dtype="object") tm.assert_frame_equal(result, expected) + def test_pivot_columns_none_raise_error(self): + # GH 30924 + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]} + ) + msg = r"pivot\(\) missing 1 required argument: 'columns'" + with pytest.raises(TypeError, match=msg): + df.pivot(index="col1", values="col3") + @pytest.mark.xfail( reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966" ) From 6245c709c3ab62ed26a3239f96ddd7a092fb4ece Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Sat, 18 Jan 2020 18:33:47 +0200 Subject: [PATCH 098/158] Updated years in LICENSE (#31100) --- LICENSE | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 924de26253bf4..76954a5a339ab 100644 --- a/LICENSE +++ b/LICENSE @@ -1,8 +1,10 @@ BSD 3-Clause License -Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team +Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. +Copyright (c) 2011-2020, Open source contributors. + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: From d16dc5af3ed6712e3d1bbf1146ea8764a7336e0e Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 18 Jan 2020 08:37:39 -0800 Subject: [PATCH 099/158] Pull Request Tips (#31093) --- doc/source/development/contributing.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 4fdcb93745094..2dcb6a32d7941 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -1525,3 +1525,19 @@ The branch will still exist on GitHub, so to delete it there do:: git push origin --delete shiny-new-feature .. _Gitter: https://gitter.im/pydata/pandas + + +Tips for a successful Pull Request +================================== + +If you have made it to the `Review your code`_ phase, one of the core contributors may +take a look. Please note however that a handful of people are responsible for reviewing +all of the contributions, which can often lead to bottlenecks. + +To improve the chances of your pull request being reviewed, you should: + +- **Reference an open issue** for non-trivial changes to clarify the PR's purpose +- **Ensure you have appropriate tests**. 
These should be the first part of any PR
+- **Keep your pull requests as simple as possible**. Larger PRs take longer to review
+- **Ensure that CI is in a green state**. Reviewers may not even look otherwise
+- **Keep** `Updating your pull request`_, either by request or every few days

From a3c772278b13cce7fceb29e4712f4d1d35cd142e Mon Sep 17 00:00:00 2001
From: saloni30
Date: Sat, 18 Jan 2020 22:11:11 +0530
Subject: [PATCH 100/158] DOC: Moved PANDAS_TESTING_MODE tip to .travis.yml
 (#30694) (#31008)

---
 .travis.yml | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a23bc8a4e905f..2c8533d02ddc1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,10 +7,10 @@ python: 3.7
 # travis cache --delete inside the project directory from the travis command line client
 # The cache directories will be deleted if anything in ci/ changes in a commit
 cache:
-    ccache: true
-    directories:
-        - $HOME/.cache # cython cache
-        - $HOME/.ccache # compiler cache
+  ccache: true
+  directories:
+    - $HOME/.cache # cython cache
+    - $HOME/.ccache # compiler cache

 env:
   global:
@@ -20,13 +20,13 @@ env:
   - secure: "EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA="

 git:
-    # for cloning
-    depth: false
+  # for cloning
+  depth: false

 matrix:
-    fast_finish: true
+  fast_finish: true

-    include:
+  include:
   - env:
       - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)"
@@ -40,6 +40,9 @@ matrix:
       - postgresql

   - env:
+      # Enabling Deprecations when running tests
+      # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs
+      # See pandas/_testing.py for more details.
       - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1"
     services:
       - mysql
@@ -70,7 +73,6 @@ before_install:
   # This overrides travis and tells it to look nowhere.
   - export BOTO_CONFIG=/dev/null
-
 install:
   - echo "install start"
   - ci/prep_cython_cache.sh
@@ -87,5 +89,5 @@ script:
 after_script:
   - echo "after_script start"
   - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
-  - ci/print_skipped.py
+  - ci/print_skipped.py
   - echo "after_script done"

From cdf67749dd15f61b8761355de42c3d5ceef3ac74 Mon Sep 17 00:00:00 2001
From: Harshavardhan Bachina
Date: Sat, 18 Jan 2020 11:11:20 -0600
Subject: [PATCH 101/158] DOC: Replace ggpy with plotnine in ecosystem (#31097)

---
 doc/source/ecosystem.rst          | 10 ++++------
 web/pandas/community/ecosystem.md | 11 ++++-------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index be61b83d46a26..90f839897ce4b 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -122,16 +122,14 @@ also goes beyond matplotlib and pandas with the option to perform statistical
 estimation while plotting, aggregating across observations and visualizing
 the fit of statistical models to emphasize patterns in a dataset.

-`yhat/ggpy <https://github.com/yhat/ggpy>`__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+`plotnine <https://github.com/has2k1/plotnine/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Hadley Wickham's `ggplot2 <https://ggplot2.tidyverse.org/>`__ is a foundational exploratory
 visualization package for the R language.
Based on `"The Grammar of Graphics" <https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html>`__
it provides a powerful, declarative and extremely general way to generate bespoke plots of any kind of data.
-It's really quite incredible. Various implementations to other languages are available,
-but a faithful implementation for Python users has long been missing. Although still young
-(as of Jan-2014), the `yhat/ggpy <https://github.com/yhat/ggpy>`__ project has been
-progressing quickly in that direction.
+Various implementations to other languages are available.
+A good implementation for Python users is `has2k1/plotnine <https://github.com/has2k1/plotnine/>`__.

 `IPython Vega <https://github.com/vega/ipyvega>`__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index af6fd1ac77605..a707854c6ed2c 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -84,19 +84,16 @@ pandas with the option to perform statistical
 estimation while plotting, aggregating across observations and
 visualizing the fit of statistical models to emphasize patterns in a
 dataset.

-### [yhat/ggpy](https://github.com/yhat/ggpy)
+### [plotnine](https://github.com/has2k1/plotnine/)

 Hadley Wickham's [ggplot2](https://ggplot2.tidyverse.org/) is a
 foundational exploratory visualization package for the R language. Based
 on ["The Grammar of
 Graphics"](https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html)
 it provides a powerful, declarative and extremely general way to
-generate bespoke plots of any kind of data. It's really quite
-incredible. Various implementations to other languages are available,
-but a faithful implementation for Python users has long been missing.
-Although still young (as of Jan-2014), the
-[yhat/ggpy](https://github.com/yhat/ggpy) project has been progressing
-quickly in that direction.
+generate bespoke plots of any kind of data.
+Various implementations to other languages are available.
+A good implementation for Python users is [has2k1/plotnine](https://github.com/has2k1/plotnine/).
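
As a quick illustration of the grammar-of-graphics style described above, here is a
minimal plotnine sketch (it assumes ``plotnine`` is installed; the toy data is made
up for the example)::

    import pandas as pd
    from plotnine import aes, geom_point, ggplot

    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1, 4, 9, 16]})

    # Compose data + aesthetic mapping + geometry with ``+``, as in ggplot2.
    plot = ggplot(df, aes(x="x", y="y")) + geom_point()
    plot.save("points.png")  # or evaluate ``plot`` in a notebook to render it
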
### [IPython Vega](https://github.com/vega/ipyvega) From d170cc05ed7fc3579384cec328f29efc169e5c13 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 09:20:03 -0800 Subject: [PATCH 102/158] TST/BUG: fix incorrectly-passing Exception (#30553) --- pandas/io/html.py | 11 ++++++++--- pandas/tests/io/test_html.py | 8 ++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 809ce77eef0bb..75cb0fafaa6b3 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -591,9 +591,14 @@ def _setup_build_doc(self): def _build_doc(self): from bs4 import BeautifulSoup - return BeautifulSoup( - self._setup_build_doc(), features="html5lib", from_encoding=self.encoding - ) + bdoc = self._setup_build_doc() + if isinstance(bdoc, bytes) and self.encoding is not None: + udoc = bdoc.decode(self.encoding) + from_encoding = None + else: + udoc = bdoc + from_encoding = self.encoding + return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) def _build_xpath_expr(attrs) -> str: diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 626df839363cb..7a814ce82fd73 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1158,9 +1158,9 @@ def test_displayed_only(self, displayed_only, exp0, exp1): assert len(dfs) == 1 # Should not parse hidden table def test_encode(self, html_encoding_file): - _, encoding = os.path.splitext(os.path.basename(html_encoding_file))[0].split( - "_" - ) + base_path = os.path.basename(html_encoding_file) + root = os.path.splitext(base_path)[0] + _, encoding = root.split("_") try: with open(html_encoding_file, "rb") as fobj: @@ -1183,7 +1183,7 @@ def test_encode(self, html_encoding_file): if is_platform_windows(): if "16" in encoding or "32" in encoding: pytest.skip() - raise + raise def test_parse_failure_unseekable(self): # Issue #17975 From 52c22b23a617148d229f946ce8301d1fc68c3ec4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 11:39:00 -0800 Subject: [PATCH 103/158] REF/CLN: Index.get_value wrapping incorrectly (#31125) --- pandas/core/indexes/base.py | 19 +++++++++---------- pandas/core/series.py | 12 ------------ 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a630938afeb8a..5ce2b06ed7dbd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4639,7 +4639,8 @@ def get_value(self, series, key): k = self._convert_scalar_indexer(key, kind="getitem") try: - return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None)) + loc = self._engine.get_loc(k) + except KeyError as e1: if len(self) > 0 and (self.holds_integer() or self.is_boolean()): raise @@ -4648,19 +4649,17 @@ def get_value(self, series, key): return libindex.get_value_at(s, key) except IndexError: raise - except TypeError: - # generator/iterator-like - if is_iterator(key): - raise InvalidIndexError(key) - else: - raise e1 except Exception: raise e1 except TypeError: # e.g. 
"[False] is an invalid key" - if is_scalar(key): - raise IndexError(key) - raise InvalidIndexError(key) + raise IndexError(key) + + else: + if is_scalar(loc): + tz = getattr(series.dtype, "tz", None) + return libindex.get_value_at(s, loc, tz=tz) + return series.iloc[loc] def set_value(self, arr, key, value): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index ec9475c6dcba9..580e3745136d7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -815,18 +815,6 @@ def __getitem__(self, key): try: result = self.index.get_value(self, key) - if not is_scalar(result): - if is_list_like(result) and not isinstance(result, Series): - - # we need to box if loc of the key isn't scalar here - # otherwise have inline ndarray/lists - try: - if not is_scalar(self.index.get_loc(key)): - result = self._constructor( - result, index=[key] * len(result), dtype=self.dtype - ).__finalize__(self) - except KeyError: - pass return result except InvalidIndexError: pass From a44697990b97fed64594dbca4c9da1b9e49536b2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 18 Jan 2020 15:24:03 -0800 Subject: [PATCH 104/158] REF: simplify Float64Index.get_loc (#31123) --- pandas/core/indexes/numeric.py | 23 +++++++++++---------- pandas/tests/indexes/multi/test_indexing.py | 3 ++- pandas/tests/indexes/test_numeric.py | 3 ++- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 566341d78c7ed..a8c2303d65361 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -488,17 +488,18 @@ def __contains__(self, other) -> bool: @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): - try: - if np.all(np.isnan(key)) or is_bool(key): - nan_idxs = self._nan_idxs - try: - return nan_idxs.item() - except ValueError: - if not len(nan_idxs): - raise KeyError(key) - return nan_idxs - except (TypeError, NotImplementedError): - pass + if is_bool(key): + # Catch this to avoid accidentally casting to 1.0 + raise KeyError(key) + + if is_float(key) and np.isnan(key): + nan_idxs = self._nan_idxs + if not len(nan_idxs): + raise KeyError(key) + elif len(nan_idxs) == 1: + return nan_idxs[0] + return nan_idxs + return super().get_loc(key, method=method, tolerance=tolerance) @cache_readonly diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index ad6f06d065150..9070eb3deffb5 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -396,7 +396,8 @@ def test_get_loc_missing_nan(): idx.get_loc(3) with pytest.raises(KeyError, match=r"^nan$"): idx.get_loc(np.nan) - with pytest.raises(KeyError, match=r"^\[nan\]$"): + with pytest.raises(TypeError, match=r"'\[nan\]' is an invalid key"): + # listlike/non-hashable raises TypeError idx.get_loc([np.nan]) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 582f6c619d287..12cc51222e6bb 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -389,7 +389,8 @@ def test_get_loc_missing_nan(self): idx.get_loc(3) with pytest.raises(KeyError, match="^nan$"): idx.get_loc(np.nan) - with pytest.raises(KeyError, match=r"^\[nan\]$"): + with pytest.raises(TypeError, match=r"'\[nan\]' is an invalid key"): + # listlike/non-hashable raises TypeError idx.get_loc([np.nan]) def test_contains_nans(self): From 7d280409c40302437c37b520945615e5a1f90ffc Mon Sep 17 00:00:00 2001 From: 
Josh Dimarsky <24758845+yehoshuadimarsky@users.noreply.github.com> Date: Sat, 18 Jan 2020 20:00:58 -0500 Subject: [PATCH 105/158] ENH: Create DockerFile and devcontainer.json files to work with Docker and VS Code in Containers (#30638) Co-Authored-By: gfyoung Co-Authored-By: William Ayd --- .devcontainer.json | 28 +++++++++++++++ Dockerfile | 47 +++++++++++++++++++++++++ doc/source/development/contributing.rst | 11 ++++++ 3 files changed, 86 insertions(+) create mode 100644 .devcontainer.json create mode 100644 Dockerfile diff --git a/.devcontainer.json b/.devcontainer.json new file mode 100644 index 0000000000000..315a1ff647012 --- /dev/null +++ b/.devcontainer.json @@ -0,0 +1,28 @@ +// For format details, see https://aka.ms/vscode-remote/devcontainer.json or the definition README at +// https://github.com/microsoft/vscode-dev-containers/tree/master/containers/python-3-miniconda +{ + "name": "pandas", + "context": ".", + "dockerFile": "Dockerfile", + + // Use 'settings' to set *default* container specific settings.json values on container create. + // You can edit these settings after create using File > Preferences > Settings > Remote. + "settings": { + "terminal.integrated.shell.linux": "/bin/bash", + "python.condaPath": "/opt/conda/bin/conda", + "python.pythonPath": "/opt/conda/bin/python", + "python.formatting.provider": "black", + "python.linting.enabled": true, + "python.linting.flake8Enabled": true, + "python.linting.pylintEnabled": false, + "python.linting.mypyEnabled": true, + "python.testing.pytestEnabled": true, + "python.testing.cwd": "pandas/tests" + }, + + // Add the IDs of extensions you want installed when the container is created in the array below. + "extensions": [ + "ms-python.python", + "ms-vscode.cpptools" + ] +} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000..b8aff5d671dcf --- /dev/null +++ b/Dockerfile @@ -0,0 +1,47 @@ +FROM continuumio/miniconda3 + +# if you forked pandas, you can pass in your own GitHub username to use your fork +# i.e. gh_username=myname +ARG gh_username=pandas-dev +ARG pandas_home="/home/pandas" + +# Avoid warnings by switching to noninteractive +ENV DEBIAN_FRONTEND=noninteractive + +# Configure apt and install packages +RUN apt-get update \ + && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \ + # + # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed + && apt-get -y install git iproute2 procps iproute2 lsb-release \ + # + # Install C compilers (gcc not enough, so just went with build-essential which admittedly might be overkill), + # needed to build pandas C extensions + && apt-get -y install build-essential \ + # + # cleanup + && apt-get autoremove -y \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* + +# Switch back to dialog for any ad-hoc use of apt-get +ENV DEBIAN_FRONTEND=dialog + +# Clone pandas repo +RUN mkdir "$pandas_home" \ + && git clone "https://github.com/$gh_username/pandas.git" "$pandas_home" \ + && cd "$pandas_home" \ + && git remote add upstream "https://github.com/pandas-dev/pandas.git" \ + && git pull upstream master + +# Because it is surprisingly difficult to activate a conda environment inside a DockerFile +# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89), +# we just update the base/root one from the 'environment.yml' file instead of creating a new one. 
+# +# Set up environment +RUN conda env update -n base -f "$pandas_home/environment.yml" + +# Build C extensions and pandas +RUN cd "$pandas_home" \ + && python setup.py build_ext --inplace -j 4 \ + && python -m pip install -e . diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 2dcb6a32d7941..b650b2a2cf1fe 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -146,6 +146,17 @@ requires a C compiler and Python environment. If you're making documentation changes, you can skip to :ref:`contributing.documentation` but you won't be able to build the documentation locally before pushing your changes. +Using a Docker Container +~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of manually setting up a development environment, you can use Docker to +automatically create the environment with just several commands. Pandas provides a `DockerFile` +in the root directory to build a Docker image with a full pandas development environment. + +Even easier, you can use the DockerFile to launch a remote session with Visual Studio Code, +a popular free IDE, using the `.devcontainer.json` file. +See https://code.visualstudio.com/docs/remote/containers for details. + .. _contributing.dev_c: Installing a C compiler From 31a7f576913e8c84774645089acc8e0b5b81128a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 19 Jan 2020 12:54:13 -0800 Subject: [PATCH 106/158] REF: share code between Int64Index and UInt64Index (#31129) --- pandas/core/indexes/numeric.py | 47 ++++++++++------------------------ 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index a8c2303d65361..def77ffbea591 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -231,6 +231,8 @@ class IntegerIndex(NumericIndex): This is an abstract class for Int64Index, UInt64Index. """ + _default_dtype: np.dtype + def __contains__(self, key) -> bool: """ Check if key is a float and has a decimal. If it has, return False. 
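
The sharing in this refactor hinges on each concrete subclass supplying a
``_default_dtype`` that the base-class methods key off. A standalone sketch of the
pattern (illustrative class names only, not pandas code)::

    import numpy as np

    class IntegerBase:
        # Concrete subclasses declare the dtype their data is stored as.
        _default_dtype: np.dtype

        def view_as_default(self, values: np.ndarray) -> np.ndarray:
            # Reinterpret the buffer without copying, mirroring the shared
            # ``asi8`` property this commit consolidates.
            return values.view(self._default_dtype)

    class Int64Like(IntegerBase):
        _default_dtype = np.dtype(np.int64)

    class UInt64Like(IntegerBase):
        _default_dtype = np.dtype(np.uint64)
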
@@ -243,26 +245,17 @@ def __contains__(self, key) -> bool: except (OverflowError, TypeError, ValueError): return False - -class Int64Index(IntegerIndex): - __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args - - _typ = "int64index" - _can_hold_na = False - _engine_type = libindex.Int64Engine - _default_dtype = np.int64 - @property def inferred_type(self) -> str: """ - Always 'integer' for ``Int64Index`` + Always 'integer' for ``Int64Index`` and ``UInt64Index`` """ return "integer" @property def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak - return self.values.view("i8") + return self.values.view(self._default_dtype) @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): @@ -273,6 +266,15 @@ def _convert_scalar_indexer(self, key, kind=None): key = self._maybe_cast_indexer(key) return super()._convert_scalar_indexer(key, kind=kind) + +class Int64Index(IntegerIndex): + __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args + + _typ = "int64index" + _can_hold_na = False + _engine_type = libindex.Int64Engine + _default_dtype = np.dtype(np.int64) + def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) return Int64Index(joined, name=name) @@ -307,28 +309,7 @@ class UInt64Index(IntegerIndex): _typ = "uint64index" _can_hold_na = False _engine_type = libindex.UInt64Engine - _default_dtype = np.uint64 - - @property - def inferred_type(self) -> str: - """ - Always 'integer' for ``UInt64Index`` - """ - return "integer" - - @property - def asi8(self) -> np.ndarray: - # do not cache or you'll create a memory leak - return self.values.view("u8") - - @Appender(_index_shared_docs["_convert_scalar_indexer"]) - def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["loc", "getitem", "iloc", None] - - # don't coerce ilocs to integers - if kind != "iloc": - key = self._maybe_cast_indexer(key) - return super()._convert_scalar_indexer(key, kind=kind) + _default_dtype = np.dtype(np.uint64) @Appender(_index_shared_docs["_convert_arr_indexer"]) def _convert_arr_indexer(self, keyarr): From ebaed91fcb4cc1df93baab03821eec3bfda01fbb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 19 Jan 2020 12:54:35 -0800 Subject: [PATCH 107/158] REF: share code between Int64Index and UInt64Index (#31129) From 4f1afc450700ff1dcc5f8df6cceee82d9aef6f0a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 19 Jan 2020 12:55:17 -0800 Subject: [PATCH 108/158] REF: share code between Int64Index and UInt64Index (#31129) From 9843f437fdd6b66c9227524734a4a8382794a2f7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 19 Jan 2020 12:56:03 -0800 Subject: [PATCH 109/158] REF: share code between Int64Index and UInt64Index (#31129) From 7c94949dc89c62cae1bc647acd87266d6c3a0468 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 19 Jan 2020 12:58:35 -0800 Subject: [PATCH 110/158] TYP: Index get_indexer_foo methods (#31137) --- pandas/_libs/index.pyx | 22 ++++++++++------------ pandas/_libs/index_class_helper.pxi.in | 5 +---- pandas/core/indexes/base.py | 23 ++++++++++++++++------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 4bcdb5d96a32d..e5e3b27c41721 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -213,7 +213,8 @@ cdef class IndexEngine: return self.monotonic_dec == 1 cdef inline _do_monotonic_check(self): - cdef object is_unique + cdef: + bint is_unique try: values = 
self._get_index_values() self.monotonic_inc, self.monotonic_dec, is_unique = \ @@ -236,10 +237,10 @@ cdef class IndexEngine: cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=False) - def get_backfill_indexer(self, other, limit=None): + def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: return algos.backfill(self._get_index_values(), other, limit=limit) - def get_pad_indexer(self, other, limit=None): + def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: return algos.pad(self._get_index_values(), other, limit=limit) cdef _make_hash_table(self, Py_ssize_t n): @@ -477,13 +478,13 @@ cdef class DatetimeEngine(Int64Engine): values = np.asarray(values).view('i8') return self.mapping.lookup(values) - def get_pad_indexer(self, other, limit=None): + def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: if other.dtype != self._get_box_dtype(): return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') return algos.pad(self._get_index_values(), other, limit=limit) - def get_backfill_indexer(self, other, limit=None): + def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: if other.dtype != self._get_box_dtype(): return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') @@ -506,16 +507,13 @@ cdef class PeriodEngine(Int64Engine): cdef _get_index_values(self): return super(PeriodEngine, self).vgetter().view("i8") - cdef void _call_map_locations(self, values): - # super(...) pattern doesn't seem to work with `cdef` - Int64Engine._call_map_locations(self, values.view('i8')) - cdef _call_monotonic(self, values): # super(...) pattern doesn't seem to work with `cdef` return Int64Engine._call_monotonic(self, values.view('i8')) def get_indexer(self, values): - cdef ndarray[int64_t, ndim=1] ordinals + cdef: + ndarray[int64_t, ndim=1] ordinals super(PeriodEngine, self)._ensure_mapping_populated() @@ -524,14 +522,14 @@ cdef class PeriodEngine(Int64Engine): return self.mapping.lookup(ordinals) - def get_pad_indexer(self, other, limit=None): + def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: freq = super(PeriodEngine, self).vgetter().freq ordinal = periodlib.extract_ordinals(other, freq) return algos.pad(self._get_index_values(), np.asarray(ordinal), limit=limit) - def get_backfill_indexer(self, other, limit=None): + def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: freq = super(PeriodEngine, self).vgetter().freq ordinal = periodlib.extract_ordinals(other, freq) diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 093cca4fe7ed5..cd2b9fbe7d6d6 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -53,10 +53,7 @@ cdef class {{name}}Engine(IndexEngine): ndarray[{{ctype}}] values int count = 0 - {{if name not in {'Float64', 'Float32'} }} - if not util.is_integer_object(val): - raise KeyError(val) - {{endif}} + self._check_type(val) # A view is needed for some subclasses, such as PeriodEngine: values = self._get_index_values().view('{{dtype}}') diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5ce2b06ed7dbd..dc74840958e1f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -96,6 +96,7 @@ duplicated="np.ndarray", ) _index_shared_docs = dict() +str_t = str def _make_comparison_op(op, cls): @@ -2959,7 +2960,9 @@ def get_loc(self, key, method=None, tolerance=None): """ 
     @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
-    def get_indexer(self, target, method=None, limit=None, tolerance=None):
+    def get_indexer(
+        self, target, method=None, limit=None, tolerance=None
+    ) -> np.ndarray:
         method = missing.clean_reindex_fill_method(method)
         target = ensure_index(target)
         if tolerance is not None:
@@ -3016,14 +3019,16 @@ def _convert_tolerance(self, tolerance, target):
             raise ValueError("list-like tolerance size must match target index size")
         return tolerance

-    def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
+    def _get_fill_indexer(
+        self, target: "Index", method: str_t, limit=None, tolerance=None
+    ) -> np.ndarray:
         if self.is_monotonic_increasing and target.is_monotonic_increasing:
-            method = (
+            engine_method = (
                 self._engine.get_pad_indexer
                 if method == "pad"
                 else self._engine.get_backfill_indexer
             )
-            indexer = method(target._ndarray_values, limit)
+            indexer = engine_method(target._ndarray_values, limit)
         else:
             indexer = self._get_fill_indexer_searchsorted(target, method, limit)
         if tolerance is not None:
@@ -3032,7 +3037,9 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
             )
         return indexer

-    def _get_fill_indexer_searchsorted(self, target, method, limit=None):
+    def _get_fill_indexer_searchsorted(
+        self, target: "Index", method: str_t, limit=None
+    ) -> np.ndarray:
         """
         Fallback pad/backfill get_indexer that works for monotonic decreasing
         indexes and non-monotonic targets.
@@ -3063,7 +3070,7 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
         indexer[indexer == len(self)] = -1
         return indexer

-    def _get_nearest_indexer(self, target, limit, tolerance):
+    def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray:
         """
         Get the indexer for the nearest index labels; requires an index with
         values that can be subtracted from each other (e.g., not strings or
@@ -3086,7 +3093,9 @@ def _get_nearest_indexer(self, target, limit, tolerance):
         indexer = self._filter_indexer_tolerance(target, indexer, tolerance)
         return indexer

-    def _filter_indexer_tolerance(self, target, indexer, tolerance):
+    def _filter_indexer_tolerance(
+        self, target: "Index", indexer: np.ndarray, tolerance
+    ) -> np.ndarray:
         distance = abs(self.values[indexer] - target)
         indexer = np.where(distance <= tolerance, indexer, -1)
         return indexer

From 54ba6db95010a22229bdd3c3ed249676259eae9f Mon Sep 17 00:00:00 2001
From: kylekeppler
Date: Mon, 20 Jan 2020 08:34:11 -0500
Subject: [PATCH 111/158] DOC: Fix HDF5 complevel and complib formatting
 (#31053)

---
 doc/source/user_guide/io.rst | 83 +++++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 55bbf6848820b..e776da016d5d7 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4220,46 +4220,49 @@ Compression
 all kinds of stores, not just tables. Two parameters are used to
 control compression: ``complevel`` and ``complib``.

-``complevel`` specifies if and how hard data is to be compressed.
-  ``complevel=0`` and ``complevel=None`` disables
-  compression and ``0<complevel<10`` enables compression.
-
-``complib`` specifies which compression library to use. If nothing is
-specified the default library ``zlib`` is used. A compression library
-usually optimizes for either good compression rates or speed and the
-results will depend on the type of data. Which type of compression to
-choose depends on your specific needs and data. The list of supported
-compression libraries:
-
-- `zlib <https://zlib.net/>`_: The default compression library.
-  A classic in terms of compression, achieves good compression
-  rates but is somewhat slow.
-- `lzo <https://www.oberhumer.com/opensource/lzo/>`_: Fast compression and decompression.
-- `bzip2 <http://bzip.org/>`_: Good compression rates.
-- `blosc <http://www.blosc.org/>`_: Fast compression and decompression.
-
-  Support for alternative blosc compressors:
-
-  - `blosc:blosclz <http://www.blosc.org/>`_ This is the
-    default compressor for ``blosc``
-  - `blosc:lz4
-    <https://fastcompression.blogspot.com/p/lz4.html>`_:
-    A compact, very popular and fast compressor.
-  - `blosc:lz4hc
-    <https://fastcompression.blogspot.com/p/lz4.html>`_:
-    A tweaked version of LZ4, produces better
-    compression ratios at the expense of speed.
-  - `blosc:snappy <https://google.github.io/snappy/>`_:
-    A popular compressor used in many places.
-  - `blosc:zlib <https://zlib.net/>`_: A classic;
-    somewhat slower than the previous ones, but
-    achieving better compression ratios.
-  - `blosc:zstd <https://facebook.github.io/zstd/>`_: An
-    extremely well balanced codec; it provides the best
-    compression ratios among the others above, and at
-    reasonably fast speed.
-
-  If ``complib`` is defined as something other than the
-  listed libraries a ``ValueError`` exception is issued.
+* ``complevel`` specifies if and how hard data is to be compressed.
+  ``complevel=0`` and ``complevel=None`` disables compression and
+  ``0<complevel<10`` enables compression.
+
+* ``complib`` specifies which compression library to use.
+  If nothing is specified the default library ``zlib`` is used. A
+  compression library usually optimizes for either good compression rates
+  or speed and the results will depend on the type of data. Which type of
+  compression to choose depends on your specific needs and data. The list
+  of supported compression libraries:
+
+  - `zlib <https://zlib.net/>`_: The default compression library.
+    A classic in terms of compression, achieves good compression
+    rates but is somewhat slow.
+  - `lzo <https://www.oberhumer.com/opensource/lzo/>`_: Fast
+    compression and decompression.
+  - `bzip2 <http://bzip.org/>`_: Good compression rates.
+  - `blosc <http://www.blosc.org/>`_: Fast compression and
+    decompression.
+
+    Support for alternative blosc compressors:
+
+    - `blosc:blosclz <http://www.blosc.org/>`_ This is the
+      default compressor for ``blosc``
+    - `blosc:lz4
+      <https://fastcompression.blogspot.com/p/lz4.html>`_:
+      A compact, very popular and fast compressor.
+    - `blosc:lz4hc
+      <https://fastcompression.blogspot.com/p/lz4.html>`_:
+      A tweaked version of LZ4, produces better
+      compression ratios at the expense of speed.
+    - `blosc:snappy <https://google.github.io/snappy/>`_:
+      A popular compressor used in many places.
+    - `blosc:zlib <https://zlib.net/>`_: A classic;
+      somewhat slower than the previous ones, but
+      achieving better compression ratios.
+    - `blosc:zstd <https://facebook.github.io/zstd/>`_: An
+      extremely well balanced codec; it provides the best
+      compression ratios among the others above, and at
+      reasonably fast speed.
+
+  If ``complib`` is defined as something other than the listed libraries a
+  ``ValueError`` exception is issued.

 ..
note:: From 212b71497b0d87fd47dd38caa5db68161dd57ba4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Jan 2020 05:35:20 -0800 Subject: [PATCH 112/158] DOC: Restore ExtensionIndex.dropna.__doc__ (#31095) --- pandas/core/indexes/extension.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 58fcce7e59be7..db35cdb72979f 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -6,7 +6,7 @@ import numpy as np from pandas.compat.numpy import function as nv -from pandas.util._decorators import cache_readonly +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ensure_platform_int, is_dtype_equal from pandas.core.dtypes.generic import ABCSeries @@ -188,6 +188,7 @@ def __iter__(self): def _ndarray_values(self) -> np.ndarray: return self._data._ndarray_values + @Appender(Index.dropna.__doc__) def dropna(self, how="any"): if how not in ("any", "all"): raise ValueError(f"invalid how option: {how}") @@ -201,6 +202,7 @@ def repeat(self, repeats, axis=None): result = self._data.repeat(repeats, axis=axis) return self._shallow_copy(result) + @Appender(Index.take.__doc__) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) @@ -230,6 +232,7 @@ def _get_unique_index(self, dropna=False): result = result[~result.isna()] return self._shallow_copy(result) + @Appender(Index.astype.__doc__) def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype) and copy is False: # Ensure that self.astype(self.dtype) is self From eb492a260dae2f25d1b780876c0fd563e8b6005b Mon Sep 17 00:00:00 2001 From: rebecca-palmer Date: Mon, 20 Jan 2020 15:16:41 +0000 Subject: [PATCH 113/158] Use Python 3 shebangs (#31147) --- ci/print_skipped.py | 2 +- doc/make.py | 2 +- doc/sphinxext/announce.py | 2 +- pandas/core/computation/eval.py | 2 +- pandas/tests/io/generate_legacy_storage_files.py | 2 +- pandas/tests/plotting/common.py | 2 +- scripts/find_commits_touching_func.py | 2 +- scripts/generate_pip_deps_from_conda.py | 2 +- scripts/validate_docstrings.py | 2 +- scripts/validate_string_concatenation.py | 2 +- setup.py | 2 +- web/pandas_web.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ci/print_skipped.py b/ci/print_skipped.py index 72822fa2d3c7f..60e2f047235e6 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os import xml.etree.ElementTree as et diff --git a/doc/make.py b/doc/make.py index cf73f44b5dd02..024a748cd28ca 100755 --- a/doc/make.py +++ b/doc/make.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Python script for building documentation. diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index fdc5a6b283ba8..f394aac5c545b 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- encoding:utf-8 -*- """ Script to generate contributor and pull request lists diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 51892b8c02d87..71e1b6c2a08a9 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Top level ``eval`` module. 
diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 6ef0e0457e2e2..67b767a337a89 100755 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ self-contained to write legacy storage pickle files diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 9f43027836eb4..a604d90acc854 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 import os diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 5e1a169dbfc3f..85675cb6df42b 100755 --- a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # copyright 2013, y-p @ github """ Search the git history for all commits touching a named method diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 9e0ec4df02edf..b0a06416ce443 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Convert the conda environment.yml to the pip requirements-dev.txt, or check that they have the same packages (for the CI) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 079e9a16cfd13..d43086756769a 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Analyze docstrings to detect errors. diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index 3feeddaabe8d2..fbf3bb5cfccf2 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ GH #30454 diff --git a/setup.py b/setup.py index 86fe62202c643..191fe49d1eb89 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Parts of this file were taken from the pyzmq project diff --git a/web/pandas_web.py b/web/pandas_web.py index 45dafcf0c4c10..a34a31feabce0 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Simple static site generator for the pandas web. 
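
A note on the ``Appender`` pattern seen in patch 112 above: ``pandas.util._decorators.Appender``
attaches an existing docstring to an override, so a subclass method stays documented
without duplicating the text. A minimal sketch of the idea — illustrative only, not
the pandas implementation — looks like this:

    def appender(doc):
        # copy a known-good docstring onto the decorated function
        def decorator(func):
            func.__doc__ = doc
            return func
        return decorator

    class Base:
        def dropna(self, how="any"):
            """Return the object with missing values removed."""
            return self

    class Sub(Base):
        @appender(Base.dropna.__doc__)
        def dropna(self, how="any"):
            # specialized validation, shared documentation
            if how not in ("any", "all"):
                raise ValueError(f"invalid how option: {how}")
            return super().dropna(how=how)

With this, ``help(Sub.dropna)`` shows the base class text, which is what the
``@Appender(Index.dropna.__doc__)`` decorations above achieve for ``ExtensionIndex``.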
From 71b683305ae0267d05b51711567ea2744c6023de Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Jan 2020 07:32:58 -0800 Subject: [PATCH 114/158] REF/BUG: Index.get_value called incorrectly, de-duplicate+simplify (#31134) --- pandas/core/indexes/base.py | 59 ++++++++----------- pandas/core/indexes/datetimes.py | 5 +- pandas/core/indexes/timedeltas.py | 4 +- .../tests/indexes/datetimes/test_indexing.py | 12 ++-- pandas/tests/indexes/test_base.py | 7 ++- 5 files changed, 43 insertions(+), 44 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dc74840958e1f..36769ccd7d5b3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -68,7 +68,6 @@ from pandas.core.arrays import ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com -from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing @@ -4627,48 +4626,40 @@ def get_value(self, series, key): # would convert to numpy arrays and raise later any way) - GH29926 raise InvalidIndexError(key) - # if we have something that is Index-like, then - # use this, e.g. DatetimeIndex - # Things like `Series._get_value` (via .at) pass the EA directly here. - s = extract_array(series, extract_numpy=True) - if isinstance(s, ExtensionArray): + try: # GH 20882, 21257 # First try to convert the key to a location # If that fails, raise a KeyError if an integer # index, otherwise, see if key is an integer, and # try that - try: - iloc = self.get_loc(key) - return s[iloc] - except KeyError: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): - raise - elif is_integer(key): - return s[key] - - k = self._convert_scalar_indexer(key, kind="getitem") - try: - loc = self._engine.get_loc(k) - - except KeyError as e1: + loc = self._engine.get_loc(key) + except KeyError: if len(self) > 0 and (self.holds_integer() or self.is_boolean()): raise - - try: - return libindex.get_value_at(s, key) - except IndexError: + elif is_integer(key): + # If the Index cannot hold integer, then this is unambiguously + # a locational lookup. + loc = key + else: raise - except Exception: - raise e1 - except TypeError: - # e.g. "[False] is an invalid key" - raise IndexError(key) - else: - if is_scalar(loc): - tz = getattr(series.dtype, "tz", None) - return libindex.get_value_at(s, loc, tz=tz) - return series.iloc[loc] + return self._get_values_for_loc(series, loc) + + def _get_values_for_loc(self, series, loc): + """ + Do a positional lookup on the given Series, returning either a scalar + or a Series. + + Assumes that `series.index is self` + """ + if is_integer(loc): + if isinstance(series._values, np.ndarray): + # Since we have an ndarray and not DatetimeArray, we dont + # have to worry about a tz. 
+ return libindex.get_value_at(series._values, loc, tz=None) + return series._values[loc] + + return series.iloc[loc] def set_value(self, arr, key, value): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 91b9aa63c6a8e..53e3cc436d513 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -667,9 +667,8 @@ def get_value(self, series, key): return com.maybe_box(self, value, series, key) def get_value_maybe_box(self, series, key): - key = self._maybe_cast_for_get_loc(key) - values = self._engine.get_value(com.values_from_object(series), key, tz=self.tz) - return com.maybe_box(self, values, series, key) + loc = self.get_loc(key) + return self._get_values_for_loc(series, loc) def get_loc(self, key, method=None, tolerance=None): """ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index e7427438828a8..45f98eaf34e40 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -259,8 +259,8 @@ def get_value(self, series, key): return com.maybe_box(self, value, series, key) def get_value_maybe_box(self, series, key: Timedelta): - values = self._engine.get_value(com.values_from_object(series), key) - return com.maybe_box(self, values, series, key) + loc = self.get_loc(key) + return self._get_values_for_loc(series, loc) def get_loc(self, key, method=None, tolerance=None): """ diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 4c600e510790a..f3c255d50aba1 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -621,17 +621,21 @@ def test_get_value(self): # specifically make sure we have test for np.datetime64 key dti = pd.date_range("2016-01-01", periods=3) - arr = np.arange(6, 8) + arr = np.arange(6, 9) + ser = pd.Series(arr, index=dti) key = dti[1] - result = dti.get_value(arr, key) + with pytest.raises(AttributeError, match="has no attribute '_values'"): + dti.get_value(arr, key) + + result = dti.get_value(ser, key) assert result == 7 - result = dti.get_value(arr, key.to_pydatetime()) + result = dti.get_value(ser, key.to_pydatetime()) assert result == 7 - result = dti.get_value(arr, key.to_datetime64()) + result = dti.get_value(ser, key.to_datetime64()) assert result == 7 def test_get_loc(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 1047c457d6b82..d40a2257771a2 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1915,7 +1915,12 @@ def test_get_value(self, index): values = np.random.randn(100) value = index[67] - tm.assert_almost_equal(index.get_value(values, value), values[67]) + with pytest.raises(AttributeError, match="has no attribute '_values'"): + # Index.get_value requires a Series, not an ndarray + index.get_value(values, value) + + result = index.get_value(Series(values, index=values), value) + tm.assert_almost_equal(result, values[67]) @pytest.mark.parametrize("values", [["foo", "bar", "quux"], {"foo", "bar", "quux"}]) @pytest.mark.parametrize( From 6a2f95b320041d166d194dda90d5bc281563eecf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Jan 2020 07:35:41 -0800 Subject: [PATCH 115/158] REF: require PeriodArray in PeriodIndex._simple_new (#31128) --- pandas/core/indexes/datetimelike.py | 28 +++++++++---------- pandas/core/indexes/period.py | 20 ++++--------- pandas/tests/arrays/test_datetimelike.py | 15 +++++----- 
.../tests/indexes/period/test_constructors.py | 21 ++++++++++---- 4 files changed, 44 insertions(+), 40 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index bf1272b223f70..d262fcdc92ebf 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -193,20 +193,21 @@ def sort_values(self, return_indexer=False, ascending=True): # because the treatment of NaT has been changed to put NaT last # instead of first. sorted_values = np.sort(self.asi8) - attribs = self._get_attributes_dict() - freq = attribs["freq"] + freq = self.freq if freq is not None and not is_period_dtype(self): if freq.n > 0 and not ascending: freq = freq * -1 elif freq.n < 0 and ascending: freq = freq * -1 - attribs["freq"] = freq if not ascending: sorted_values = sorted_values[::-1] - return self._simple_new(sorted_values, **attribs) + arr = type(self._data)._simple_new( + sorted_values, dtype=self.dtype, freq=freq + ) + return self._simple_new(arr, name=self.name) @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): @@ -503,22 +504,21 @@ def _concat_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class. """ - attribs = self._get_attributes_dict() - attribs["name"] = name + # do not pass tz to set because tzlocal cannot be hashed if len({str(x.dtype) for x in to_concat}) != 1: raise ValueError("to_concat must have the same tz") - new_data = type(self._values)._concat_same_type(to_concat).asi8 + new_data = type(self._data)._concat_same_type(to_concat) - # GH 3232: If the concat result is evenly spaced, we can retain the - # original frequency - is_diff_evenly_spaced = len(unique_deltas(new_data)) == 1 - if not is_period_dtype(self) and not is_diff_evenly_spaced: - # reset freq - attribs["freq"] = None + if not is_period_dtype(self.dtype): + # GH 3232: If the concat result is evenly spaced, we can retain the + # original frequency + is_diff_evenly_spaced = len(unique_deltas(new_data.asi8)) == 1 + if is_diff_evenly_spaced: + new_data._freq = self.freq - return self._simple_new(new_data, **attribs) + return self._simple_new(new_data, name=name) def shift(self, periods=1, freq=None): """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 35f96e61704f0..20e390f2dc7d9 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -15,7 +15,6 @@ is_datetime64_any_dtype, is_dtype_equal, is_float, - is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -234,21 +233,12 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): Parameters ---------- - values : PeriodArray, PeriodIndex, Index[int64], ndarray[int64] + values : PeriodArray Values that can be converted to a PeriodArray without inference or coercion. - """ - # TODO: raising on floats is tested, but maybe not useful. - # Should the callers know not to pass floats? - # At the very least, I think we can ensure that lists aren't passed. 
- if isinstance(values, list): - values = np.asarray(values) - if is_float_dtype(values): - raise TypeError("PeriodIndex._simple_new does not accept floats.") - if freq: - freq = Period._maybe_convert_freq(freq) - values = PeriodArray(values, freq=freq) + assert isinstance(values, PeriodArray), type(values) + assert freq is None or freq == values.freq, (freq, values.freq) result = object.__new__(cls) result._data = values @@ -834,7 +824,9 @@ def _union(self, other, sort): def _apply_meta(self, rawarr): if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) + if not isinstance(rawarr, PeriodArray): + rawarr = PeriodArray(rawarr, freq=self.freq) + rawarr = PeriodIndex._simple_new(rawarr, name=self.name) return rawarr def memory_usage(self, deep=False): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index fa45db93c6102..87b825c8c27bd 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -65,8 +65,8 @@ def test_compare_len1_raises(self): # to the case where one has length-1, which numpy would broadcast data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq="D") - arr = self.array_cls(idx) + idx = self.array_cls._simple_new(data, freq="D") + arr = self.index_cls(idx) with pytest.raises(ValueError, match="Lengths must match"): arr == arr[:1] @@ -79,8 +79,8 @@ def test_take(self): data = np.arange(100, dtype="i8") * 24 * 3600 * 10 ** 9 np.random.shuffle(data) - idx = self.index_cls._simple_new(data, freq="D") - arr = self.array_cls(idx) + arr = self.array_cls._simple_new(data, freq="D") + idx = self.index_cls._simple_new(arr) takers = [1, 4, 94] result = arr.take(takers) @@ -97,8 +97,7 @@ def test_take(self): def test_take_fill(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq="D") - arr = self.array_cls(idx) + arr = self.array_cls._simple_new(data, freq="D") result = arr.take([-1, 1], allow_fill=True, fill_value=None) assert result[0] is pd.NaT @@ -121,7 +120,9 @@ def test_take_fill(self): def test_concat_same_type(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq="D").insert(0, pd.NaT) + arr = self.array_cls._simple_new(data, freq="D") + idx = self.index_cls(arr) + idx = idx.insert(0, pd.NaT) arr = self.array_cls(idx) result = arr._concat_same_type([arr[:-1], arr[1:], arr]) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 27ee915e48e5c..dcd3c8e946e9a 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -322,22 +322,33 @@ def test_constructor_mixed(self): def test_constructor_simple_new(self): idx = period_range("2007-01", name="p", periods=2, freq="M") - result = idx._simple_new(idx, name="p", freq=idx.freq) + + with pytest.raises(AssertionError, match=""): + idx._simple_new(idx, name="p", freq=idx.freq) + + result = idx._simple_new(idx._data, name="p", freq=idx.freq) tm.assert_index_equal(result, idx) - result = idx._simple_new(idx.astype("i8"), name="p", freq=idx.freq) + with pytest.raises(AssertionError): + # Need ndarray, not Int64Index + type(idx._data)._simple_new(idx.astype("i8"), freq=idx.freq) + + arr = type(idx._data)._simple_new(idx.asi8, freq=idx.freq) + result = idx._simple_new(arr, name="p") tm.assert_index_equal(result, idx) def 
test_constructor_simple_new_empty(self):
        # GH13079
        idx = PeriodIndex([], freq="M", name="p")
-        result = idx._simple_new(idx, name="p", freq="M")
+        with pytest.raises(AssertionError, match=""):
+            idx._simple_new(idx, name="p", freq="M")
+
+        result = idx._simple_new(idx._data, name="p", freq="M")
         tm.assert_index_equal(result, idx)

     @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])])
     def test_constructor_floats(self, floats):
-        msg = r"PeriodIndex\._simple_new does not accept floats"
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(AssertionError, match="<class "):
             PeriodIndex._simple_new(floats, freq="M")

From ... Mon Sep 17 00:00:00 2001
From: ...
Date: Tue, 21 Jan 2020 00:28:00 +0800
Subject: [PATCH 116/158] BUG: groupby apply raises ValueError when groupby
 axis has duplicates and applied identity function (#30679)

---
 doc/source/whatsnew/v1.1.0.rst     |  4 +---
 pandas/core/groupby/groupby.py     | 25 ++++++++++---------------
 pandas/tests/groupby/test_apply.py | 23 +++++++++++++++++++++++
 3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 1cd325dad9f07..40c02eb495f67 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -133,9 +133,7 @@ Plotting
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^

--
--
-
+- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)

 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 233bdd11b372b..a8c96840ff17b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -969,22 +969,17 @@ def reset_identity(values):
             result = concat(values, axis=self.axis)
             ax = self._selected_obj._get_axis(self.axis)

-            if isinstance(result, Series):
-                result = result.reindex(ax)
+            # this is a very unfortunate situation
+            # we can't use reindex to restore the original order
+            # when the ax has duplicates
+            # so we resort to this
+            # GH 14776, 30667
+            if ax.has_duplicates:
+                indexer, _ = result.index.get_indexer_non_unique(ax.values)
+                indexer = algorithms.unique1d(indexer)
+                result = result.take(indexer, axis=self.axis)
             else:
-
-                # this is a very unfortunate situation
-                # we have a multi-index that is NOT lexsorted
-                # and we have a result which is duplicated
-                # we can't reindex, so we resort to this
-                # GH 14776
-                if isinstance(ax, MultiIndex) and not ax.is_unique:
-                    indexer = algorithms.unique1d(
-                        result.index.get_indexer_for(ax.values)
-                    )
-                    result = result.take(indexer, axis=self.axis)
-                else:
-                    result = result.reindex(ax, axis=self.axis)
+                result = result.reindex(ax, axis=self.axis)

         elif self.group_keys:

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 2f2f97f2cd993..e81ff37510dc0 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -467,6 +467,29 @@ def filt2(x):
     tm.assert_frame_equal(result, expected)


+@pytest.mark.parametrize("test_series", [True, False])
+def test_apply_with_duplicated_non_sorted_axis(test_series):
+    # GH 30667
+    df = pd.DataFrame(
+        [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2]
+    )
+    if test_series:
+        ser = df.set_index("Y")["X"]
+        result = ser.groupby(level=0).apply(lambda x: x)
+
+        # not expecting the order to remain the same for duplicated axis
+        result = result.sort_index()
+        expected = ser.sort_index()
+        tm.assert_series_equal(result, expected)
+    else:
+        result = df.groupby("Y").apply(lambda x: x)
+ # not expecting the order to remain the same for duplicated axis + result = result.sort_values("Y") + expected = df.sort_values("Y") + tm.assert_frame_equal(result, expected) + + def test_apply_corner_cases(): # #535, can't use sliding iterator From f4de727ea5b1e54526afd03b75e5e14054b13cdf Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Mon, 20 Jan 2020 18:30:43 +0200 Subject: [PATCH 117/158] TST: insert 'match' to bare pytest raises in pandas/tests/internals/ (#30998) --- pandas/tests/internals/test_internals.py | 33 ++++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 9c1442b75fbb2..aa966caa63238 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -297,7 +297,8 @@ def test_delete(self): assert (newb.values[1] == 1).all() newb = self.fblock.copy() - with pytest.raises(Exception): + + with pytest.raises(IndexError, match=None): newb.delete(3) @@ -321,7 +322,12 @@ def test_can_hold_element(self): val = date(2010, 10, 10) assert not block._can_hold_element(val) - with pytest.raises(TypeError): + + msg = ( + "'value' should be a 'Timestamp', 'NaT', " + "or array of those. Got 'date' instead." + ) + with pytest.raises(TypeError, match=msg): arr[0] = val @@ -350,7 +356,10 @@ def test_duplicate_ref_loc_failure(self): blocks[1].mgr_locs = np.array([0]) # test trying to create block manager with overlapping ref locs - with pytest.raises(AssertionError): + + msg = "Gaps in blk ref_locs" + + with pytest.raises(AssertionError, match=msg): BlockManager(blocks, axes) blocks[0].mgr_locs = np.array([0]) @@ -808,7 +817,11 @@ def test_validate_bool_args(self): bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") for value in invalid_values: - with pytest.raises(ValueError): + msg = ( + 'For argument "inplace" expected type bool, ' + f"received type {type(value).__name__}." 
+ ) + with pytest.raises(ValueError, match=msg): bm1.replace_list([1], [2], inplace=value) @@ -1027,9 +1040,11 @@ def test_slice_len(self): assert len(BlockPlacement(slice(1, 0, -1))) == 1 def test_zero_step_raises(self): - with pytest.raises(ValueError): + msg = "slice step cannot be zero" + + with pytest.raises(ValueError, match=msg): BlockPlacement(slice(1, 1, 0)) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): BlockPlacement(slice(1, 2, 0)) def test_unbounded_slice_raises(self): @@ -1132,9 +1147,11 @@ def assert_add_equals(val, inc, result): assert_add_equals(slice(1, 4), -1, [0, 1, 2]) assert_add_equals([1, 2, 4], -1, [0, 1, 3]) - with pytest.raises(ValueError): + msg = "iadd causes length change" + + with pytest.raises(ValueError, match=msg): BlockPlacement(slice(1, 4)).add(-10) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): BlockPlacement([1, 2, 4]).add(-10) From f8e905e97e3b55938090312fb9fcc4e67f31528a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Jan 2020 08:39:55 -0800 Subject: [PATCH 118/158] REF: share searchsorted between DTI/TDI/PI, insert between DTI/TDI (#31143) --- pandas/core/indexes/datetimelike.py | 69 ++++++++++++++++++++++++++- pandas/core/indexes/datetimes.py | 72 +--------------------------- pandas/core/indexes/period.py | 8 +--- pandas/core/indexes/timedeltas.py | 74 +---------------------------- 4 files changed, 73 insertions(+), 150 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index d262fcdc92ebf..7eaf03020e67a 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -27,12 +27,13 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core import algorithms from pandas.core.accessor import PandasDelegate from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.core.base import _shared_docs import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.extension import ( @@ -222,6 +223,18 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): self, indices, axis, allow_fill, fill_value, **kwargs ) + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + if isinstance(value, str): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + if isinstance(value, Index): + value = value._data + + return self._data.searchsorted(value, side=side, sorter=sorter) + _can_hold_na = True _na_value = NaT @@ -883,6 +896,60 @@ def _wrap_joined_index(self, joined, other): kwargs["tz"] = getattr(other, "tz", None) return self._simple_new(joined, name, **kwargs) + # -------------------------------------------------------------------- + # List-Like Methods + + def insert(self, loc, item): + """ + Make new Index inserting new item at location + Parameters + ---------- + loc : int + item : object + if not either a Python datetime or a numpy integer-like, returned + Index dtype will be object rather than datetime. 
+ Returns + ------- + new_index : Index + """ + if isinstance(item, self._data._recognized_scalars): + item = self._data._scalar_type(item) + elif is_valid_nat_for_dtype(item, self.dtype): + # GH 18295 + item = self._na_value + elif is_scalar(item) and isna(item): + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) + + freq = None + if isinstance(item, self._data._scalar_type) or item is NaT: + self._data._check_compatible_with(item, setitem=True) + + # check freq can be preserved on edge cases + if self.size and self.freq is not None: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + item = item.asm8 + + try: + new_i8s = np.concatenate( + (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) + ) + return self._shallow_copy(new_i8s, freq=freq) + except (AttributeError, TypeError): + + # fall back to object index + if isinstance(item, str): + return self.astype(object).insert(loc, item) + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) + class DatetimelikeDelegateMixin(PandasDelegate): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 53e3cc436d513..ee9b948a76ac8 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -14,11 +14,11 @@ tslib as libts, ) from pandas._libs.tslibs import ccalendar, fields, parsing, timezones -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import isna from pandas.core.accessor import delegate_names from pandas.core.arrays.datetimes import ( @@ -26,7 +26,6 @@ tz_to_dtype, validate_tz_from_dtype, ) -from pandas.core.base import _shared_docs import pandas.core.common as com from pandas.core.indexes.base import Index, maybe_extract_name from pandas.core.indexes.datetimelike import ( @@ -825,19 +824,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- - @Substitution(klass="DatetimeIndex") - @Appender(_shared_docs["searchsorted"]) - def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, str): - raise TypeError( - "searchsorted requires compatible dtype or scalar, " - f"not {type(value).__name__}" - ) - if isinstance(value, Index): - value = value._data - - return self._data.searchsorted(value, side=side) - def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "datetime" @@ -847,60 +833,6 @@ def inferred_type(self) -> str: # sure we can't have ambiguous indexing return "datetime64" - def insert(self, loc, item): - """ - Make new Index inserting new item at location - - Parameters - ---------- - loc : int - item : object - if not either a Python datetime or a numpy integer-like, returned - Index dtype will be object rather than datetime. - - Returns - ------- - new_index : Index - """ - if isinstance(item, self._data._recognized_scalars): - item = self._data._scalar_type(item) - elif is_valid_nat_for_dtype(item, self.dtype): - # GH 18295 - item = self._na_value - elif is_scalar(item) and isna(item): - # i.e. 
timedeltat64("NaT") - raise TypeError( - f"cannot insert {type(self).__name__} with incompatible label" - ) - - freq = None - if isinstance(item, self._data._scalar_type) or item is NaT: - self._data._check_compatible_with(item, setitem=True) - - # check freq can be preserved on edge cases - if self.size and self.freq is not None: - if item is NaT: - pass - elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: - freq = self.freq - elif (loc == len(self)) and item - self.freq == self[-1]: - freq = self.freq - item = item.asm8 - - try: - new_i8s = np.concatenate( - (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) - ) - return self._shallow_copy(new_i8s, freq=freq) - except (AttributeError, TypeError): - - # fall back to object index - if isinstance(item, str): - return self.astype(object).insert(loc, item) - raise TypeError( - f"cannot insert {type(self).__name__} with incompatible label" - ) - def indexer_at_time(self, time, asof=False): """ Return index locations of index values at particular time of day diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 20e390f2dc7d9..8fa24241e7ad1 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -7,7 +7,7 @@ from pandas._libs.tslibs import NaT, frequencies as libfrequencies, resolution from pandas._libs.tslibs.parsing import parse_time_string from pandas._libs.tslibs.period import Period -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_platform_int, @@ -29,7 +29,6 @@ raise_on_incompatible, validate_dtype_freq, ) -from pandas.core.base import _shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( @@ -455,11 +454,6 @@ def astype(self, dtype, copy=True, how="start"): # TODO: should probably raise on `how` here, so we don't ignore it. 
return super().astype(dtype, copy=copy) - @Substitution(klass="PeriodIndex") - @Appender(_shared_docs["searchsorted"]) - def searchsorted(self, value, side="left", sorter=None): - return self._data.searchsorted(value, side=side, sorter=sorter) - @property def is_full(self) -> bool: """ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 45f98eaf34e40..007714a1c19a2 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -4,7 +4,7 @@ import numpy as np from pandas._libs import NaT, Timedelta, index as libindex -from pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( _TD_DTYPE, @@ -16,12 +16,11 @@ is_timedelta64_ns_dtype, pandas_dtype, ) -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import isna from pandas.core.accessor import delegate_names from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.timedeltas import TimedeltaArray, _is_convertible_to_td -from pandas.core.base import _shared_docs import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name from pandas.core.indexes.datetimelike import ( @@ -345,19 +344,6 @@ def _partial_td_slice(self, key): raise NotImplementedError - @Substitution(klass="TimedeltaIndex") - @Appender(_shared_docs["searchsorted"]) - def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, str): - raise TypeError( - "searchsorted requires compatible dtype or scalar, " - f"not {type(value).__name__}" - ) - if isinstance(value, Index): - value = value._data - - return self._data.searchsorted(value, side=side, sorter=sorter) - def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "timedelta" @@ -365,62 +351,6 @@ def is_type_compatible(self, typ) -> bool: def inferred_type(self) -> str: return "timedelta64" - def insert(self, loc, item): - """ - Make new Index inserting new item at location - - Parameters - ---------- - loc : int - item : object - If not either a Python datetime or a numpy integer-like, returned - Index dtype will be object rather than datetime. - - Returns - ------- - new_index : Index - """ - # try to convert if possible - if isinstance(item, self._data._recognized_scalars): - item = self._data._scalar_type(item) - elif is_valid_nat_for_dtype(item, self.dtype): - # GH 18295 - item = self._na_value - elif is_scalar(item) and isna(item): - # i.e. 
datetime64("NaT") - raise TypeError( - f"cannot insert {type(self).__name__} with incompatible label" - ) - - freq = None - if isinstance(item, self._data._scalar_type) or item is NaT: - self._data._check_compatible_with(item, setitem=True) - - # check freq can be preserved on edge cases - if self.size and self.freq is not None: - if item is NaT: - pass - elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: - freq = self.freq - elif (loc == len(self)) and item - self.freq == self[-1]: - freq = self.freq - item = item.asm8 - - try: - new_i8s = np.concatenate( - (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) - ) - tda = type(self._data)._simple_new(new_i8s, freq=freq) - return self._shallow_copy(tda) - except (AttributeError, TypeError): - - # fall back to object index - if isinstance(item, str): - return self.astype(object).insert(loc, item) - raise TypeError( - f"cannot insert {type(self).__name__} with incompatible label" - ) - TimedeltaIndex._add_logical_methods_disabled() From 264363edab2b617638b9b1d7d5db70fad372a84c Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Mon, 20 Jan 2020 19:17:45 +0200 Subject: [PATCH 119/158] TYP: Annotations in pandas/core/nanops.py (#30461) --- pandas/core/nanops.py | 137 ++++++++++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 31 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 6b03e76a1d691..2bf2be082f639 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -8,6 +8,7 @@ from pandas._config import get_option from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib +from pandas._typing import Dtype, Scalar from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask @@ -37,7 +38,7 @@ _USE_BOTTLENECK = False -def set_use_bottleneck(v=True): +def set_use_bottleneck(v: bool = True) -> None: # set/unset to use bottleneck global _USE_BOTTLENECK if _BOTTLENECK_INSTALLED: @@ -93,7 +94,9 @@ def __call__(self, alt): bn_func = None @functools.wraps(alt) - def f(values, axis=None, skipna=True, **kwds): + def f( + values: np.ndarray, axis: Optional[int] = None, skipna: bool = True, **kwds + ): if len(self.kwargs) > 0: for k, v in self.kwargs.items(): if k not in kwds: @@ -129,10 +132,10 @@ def f(values, axis=None, skipna=True, **kwds): return f -def _bn_ok_dtype(dt, name: str) -> bool: +def _bn_ok_dtype(dtype: Dtype, name: str) -> bool: # Bottleneck chokes on datetime64 - if not is_object_dtype(dt) and not ( - is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt) + if not is_object_dtype(dtype) and not ( + is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype) ): # GH 15507 @@ -163,7 +166,9 @@ def _has_infs(result) -> bool: return False -def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): +def _get_fill_value( + dtype: Dtype, fill_value: Optional[Scalar] = None, fill_value_typ=None +): """ return the correct fill value for the dtype of the values """ if fill_value is not None: return fill_value @@ -326,12 +331,12 @@ def _get_values( return values, mask, dtype, dtype_max, fill_value -def _na_ok_dtype(dtype): +def _na_ok_dtype(dtype) -> bool: # TODO: what about datetime64tz? PeriodDtype? 
return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) -def _wrap_results(result, dtype, fill_value=None): +def _wrap_results(result, dtype: Dtype, fill_value=None): """ wrap our results if needed """ if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): @@ -362,7 +367,9 @@ def _wrap_results(result, dtype, fill_value=None): return result -def _na_for_min_count(values, axis: Optional[int]): +def _na_for_min_count( + values: np.ndarray, axis: Optional[int] +) -> Union[Scalar, np.ndarray]: """ Return the missing value for `values`. @@ -393,7 +400,12 @@ def _na_for_min_count(values, axis: Optional[int]): return result -def nanany(values, axis=None, skipna: bool = True, mask=None): +def nanany( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> bool: """ Check if any elements along an axis evaluate to True. @@ -425,7 +437,12 @@ def nanany(values, axis=None, skipna: bool = True, mask=None): return values.any(axis) -def nanall(values, axis=None, skipna: bool = True, mask=None): +def nanall( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> bool: """ Check if all elements along an axis evaluate to True. @@ -458,7 +475,13 @@ def nanall(values, axis=None, skipna: bool = True, mask=None): @disallow("M8") -def nansum(values, axis=None, skipna=True, min_count=0, mask=None): +def nansum( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + min_count: int = 0, + mask: Optional[np.ndarray] = None, +) -> float: """ Sum the elements along an axis ignoring NaNs @@ -629,7 +652,7 @@ def _get_counts_nanvar( mask: Optional[np.ndarray], axis: Optional[int], ddof: int, - dtype=float, + dtype: Dtype = float, ) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]: """ Get the count of non-null values along an axis, accounting for degrees of freedom. 
@@ -776,7 +799,13 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): @disallow("M8", "m8") -def nansem(values, axis=None, skipna=True, ddof=1, mask=None): +def nansem( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + ddof: int = 1, + mask: Optional[np.ndarray] = None, +) -> float: """ Compute the standard error in the mean along given axis while ignoring NaNs @@ -821,7 +850,12 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): @bottleneck_switch(name="nan" + meth) - def reduction(values, axis=None, skipna=True, mask=None): + def reduction( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, + ) -> Dtype: values, mask, dtype, dtype_max, fill_value = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask @@ -847,7 +881,12 @@ def reduction(values, axis=None, skipna=True, mask=None): @disallow("O") -def nanargmax(values, axis=None, skipna=True, mask=None): +def nanargmax( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> int: """ Parameters ---------- @@ -878,7 +917,12 @@ def nanargmax(values, axis=None, skipna=True, mask=None): @disallow("O") -def nanargmin(values, axis=None, skipna=True, mask=None): +def nanargmin( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> int: """ Parameters ---------- @@ -909,7 +953,12 @@ def nanargmin(values, axis=None, skipna=True, mask=None): @disallow("M8", "m8") -def nanskew(values, axis=None, skipna=True, mask=None): +def nanskew( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> float: """ Compute the sample skewness. The statistic computed here is the adjusted Fisher-Pearson standardized @@ -987,7 +1036,12 @@ def nanskew(values, axis=None, skipna=True, mask=None): @disallow("M8", "m8") -def nankurt(values, axis=None, skipna=True, mask=None): +def nankurt( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> float: """ Compute the sample excess kurtosis @@ -1075,7 +1129,13 @@ def nankurt(values, axis=None, skipna=True, mask=None): @disallow("M8", "m8") -def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): +def nanprod( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + min_count: int = 0, + mask: Optional[np.ndarray] = None, +) -> float: """ Parameters ---------- @@ -1088,7 +1148,8 @@ def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): Returns ------- - result : dtype + Dtype + The product of all elements on a given axis. ( NaNs are treated as 1) Examples -------- @@ -1096,10 +1157,6 @@ def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): >>> s = pd.Series([1, 2, 3, np.nan]) >>> nanops.nanprod(s) 6.0 - - Returns - ------- - The product of all elements on a given axis. 
( NaNs are treated as 1) """ mask = _maybe_get_mask(values, skipna, mask) @@ -1138,7 +1195,7 @@ def _get_counts( values_shape: Tuple[int], mask: Optional[np.ndarray], axis: Optional[int], - dtype=float, + dtype: Dtype = float, ) -> Union[int, np.ndarray]: """ Get the count of non-null values along an axis @@ -1184,7 +1241,13 @@ def _maybe_null_out( mask: Optional[np.ndarray], shape: Tuple, min_count: int = 1, -) -> np.ndarray: +) -> float: + """ + Returns + ------- + Dtype + The product of all elements on a given axis. ( NaNs are treated as 1) + """ if mask is not None and axis is not None and getattr(result, "ndim", False): null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 if np.any(null_mask): @@ -1218,7 +1281,9 @@ def _zero_out_fperr(arg): @disallow("M8", "m8") -def nancorr(a, b, method="pearson", min_periods=None): +def nancorr( + a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None, +): """ a, b: ndarrays """ @@ -1268,7 +1333,7 @@ def _spearman(a, b): @disallow("M8", "m8") -def nancov(a, b, min_periods=None): +def nancov(a: np.ndarray, b: np.ndarray, min_periods: Optional[int] = None): if len(a) != len(b): raise AssertionError("Operands to nancov must have same size") @@ -1341,7 +1406,9 @@ def f(x, y): nanne = make_nancomp(operator.ne) -def _nanpercentile_1d(values, mask, q, na_value, interpolation): +def _nanpercentile_1d( + values: np.ndarray, mask: np.ndarray, q, na_value: Scalar, interpolation +) -> Union[Scalar, np.ndarray]: """ Wrapper for np.percentile that skips missing values, specialized to 1-dimensional case. @@ -1372,7 +1439,15 @@ def _nanpercentile_1d(values, mask, q, na_value, interpolation): return np.percentile(values, q, interpolation=interpolation) -def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): +def nanpercentile( + values: np.ndarray, + q, + axis: int, + na_value, + mask: np.ndarray, + ndim: int, + interpolation, +): """ Wrapper for np.percentile that skips missing values. 
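
The signatures annotated in the patch above all follow one calling convention:
``values`` plus an optional ``axis``, a ``skipna`` flag, and an optional
precomputed ``mask`` of missing entries. A toy reduction — a sketch of that
contract under assumed semantics, not pandas code — might look like:

    from typing import Optional

    import numpy as np

    def toy_nansum(
        values: np.ndarray,
        axis: Optional[int] = None,
        skipna: bool = True,
        mask: Optional[np.ndarray] = None,
    ) -> float:
        # callers may pass a precomputed mask; otherwise locate NaNs here
        if mask is None:
            mask = np.isnan(values)
        if skipna:
            # replace missing entries with the reduction's neutral element
            values = np.where(mask, 0, values)
        return values.sum(axis)

    toy_nansum(np.array([1.0, np.nan, 2.0]))  # 3.0

Accepting ``mask`` as a parameter lets a caller that already knows where its
missing values are (for example, a masked extension array) skip the ``isnan``
scan, which is why the argument appears in every signature above.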
From 06e416db9729517eadf0b5a9b46243789a4f2096 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Jan 2020 09:19:08 -0800 Subject: [PATCH 120/158] BUG: raise on non-hashable in __contains__ (#30902) --- pandas/_libs/index.pyx | 13 +++++++++---- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/category.py | 3 ++- pandas/core/indexes/datetimelike.py | 5 +++-- pandas/core/indexes/interval.py | 3 ++- pandas/core/indexes/multi.py | 4 ++-- pandas/core/indexes/numeric.py | 5 +++-- pandas/core/indexes/period.py | 7 ++++--- pandas/core/indexes/range.py | 4 ++-- pandas/tests/indexes/common.py | 8 ++++++++ 10 files changed, 37 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index e5e3b27c41721..e4ec9db560b80 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -72,9 +72,10 @@ cdef class IndexEngine: self.over_size_threshold = n >= _SIZE_CUTOFF self.clear_mapping() - def __contains__(self, object val): + def __contains__(self, val: object) -> bool: + # We assume before we get here: + # - val is hashable self._ensure_mapping_populated() - hash(val) return val in self.mapping cpdef get_value(self, ndarray arr, object key, object tz=None): @@ -415,7 +416,9 @@ cdef class DatetimeEngine(Int64Engine): raise TypeError(scalar) return scalar.value - def __contains__(self, object val): + def __contains__(self, val: object) -> bool: + # We assume before we get here: + # - val is hashable cdef: int64_t loc, conv @@ -712,7 +715,9 @@ cdef class BaseMultiIndexCodesEngine: return indexer - def __contains__(self, object val): + def __contains__(self, val: object) -> bool: + # We assume before we get here: + # - val is hashable # Default __contains__ looks in the underlying mapping, which in this # case only contains integer representations. try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 36769ccd7d5b3..404c65ea0f1d5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import Dict, FrozenSet, Hashable, Optional, Union +from typing import Any, Dict, FrozenSet, Hashable, Optional, Union import warnings import numpy as np @@ -4144,7 +4144,7 @@ def is_type_compatible(self, kind) -> bool: """ @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: hash(key) try: return key in self._engine diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 0ff6469d6b19c..268ab9ba4e4c4 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -385,11 +385,12 @@ def _wrap_setop_result(self, other, result): return self._shallow_copy(result, name=name) @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. if is_scalar(key) and isna(key): return self.hasnans + hash(key) return contains(self, key, container=self._engine) def __array__(self, dtype=None) -> np.ndarray: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7eaf03020e67a..1bfec9fbad0ed 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,7 +2,7 @@ Base and utility classes for tseries type pandas objects. 
""" import operator -from typing import List, Optional, Set +from typing import Any, List, Optional, Set import numpy as np @@ -154,7 +154,8 @@ def equals(self, other) -> bool: return np.array_equal(self.asi8, other.asi8) @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key: Any) -> bool: + hash(key) try: res = self.get_loc(key) except (KeyError, TypeError, ValueError): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 523d6404f5efa..3108c1a1afd0c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -374,7 +374,7 @@ def _engine(self): right = self._maybe_convert_i8(self.right) return IntervalTree(left, right, closed=self.closed) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: """ return a boolean if this key is IN the index We *only* accept an Interval @@ -387,6 +387,7 @@ def __contains__(self, key) -> bool: ------- bool """ + hash(key) if not isinstance(key, Interval): return False diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 10a2d9f68a7b6..8682af6ab6369 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,6 +1,6 @@ import datetime from sys import getsizeof -from typing import Hashable, List, Optional, Sequence, Union +from typing import Any, Hashable, List, Optional, Sequence, Union import warnings import numpy as np @@ -973,7 +973,7 @@ def _shallow_copy_with_infer(self, values, **kwargs): return self._shallow_copy(values, **kwargs) @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: hash(key) try: self.get_loc(key) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index def77ffbea591..465f21da1278a 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import numpy as np @@ -461,7 +461,8 @@ def equals(self, other) -> bool: except (TypeError, ValueError): return False - def __contains__(self, other) -> bool: + def __contains__(self, other: Any) -> bool: + hash(other) if super().__contains__(other): return True diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 8fa24241e7ad1..d9d3e934af9f5 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,4 +1,5 @@ from datetime import datetime, timedelta +from typing import Any import weakref import numpy as np @@ -358,18 +359,18 @@ def _engine(self): return self._engine_type(period, len(self)) @Appender(_index_shared_docs["contains"]) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: if isinstance(key, Period): if key.freq != self.freq: return False else: return key.ordinal in self._engine else: + hash(key) try: self.get_loc(key) return True - except (TypeError, KeyError): - # TypeError can be reached if we pass a tuple that is not hashable + except KeyError: return False @cache_readonly diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 336f65ca574dc..22940f851ddb0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Optional, Union +from typing import Any, Optional import warnings import numpy as np @@ -332,7 +332,7 @@ def 
is_monotonic_decreasing(self) -> bool: def has_duplicates(self) -> bool: return False - def __contains__(self, key: Union[int, np.integer]) -> bool: + def __contains__(self, key: Any) -> bool: hash(key) try: key = ensure_python_int(key) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index afc068d6696ef..f3ebe8313d0c6 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -883,3 +883,11 @@ def test_getitem_2d_deprecated(self): res = idx[:, None] assert isinstance(res, np.ndarray), type(res) + + def test_contains_requires_hashable_raises(self): + idx = self.create_index() + with pytest.raises(TypeError, match="unhashable type"): + [] in idx + + with pytest.raises(TypeError): + {} in idx._engine From 998e74278f9cb05bd673997fa59653345322b28a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Jan 2020 09:19:48 -0800 Subject: [PATCH 121/158] BUG: EAs should not be hashable (#30908) --- pandas/core/arrays/base.py | 6 ++++++ pandas/tests/extension/base/methods.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9723343ea7af5..c3c91cea43f6b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -176,6 +176,9 @@ class ExtensionArray: types present. See :ref:`extending.extension.ufunc` for more. + + By default, ExtensionArrays are not hashable. Immutable subclasses may + override this behavior. """ # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. @@ -1073,6 +1076,9 @@ def _reduce(self, name, skipna=True, **kwargs): """ raise TypeError(f"cannot perform {name} with type {self.dtype}") + def __hash__(self): + raise TypeError(f"unhashable type: {repr(type(self).__name__)}") + class ExtensionOpsMixin: """ diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 1e427c6319cab..6b75176ebd35b 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -261,6 +261,11 @@ def test_shift_fill_value(self, data): expected = data.take([2, 3, 0, 0]) self.assert_extension_array_equal(result, expected) + def test_not_hashable(self, data): + # We are in general mutable, so not hashable + with pytest.raises(TypeError, match="unhashable type"): + hash(data) + def test_hash_pandas_object_works(self, data, as_frame): # https://github.com/pandas-dev/pandas/issues/23066 data = pd.Series(data) From cd7c784aabc81eafa93d681f892a6256def9aa9e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Jan 2020 09:21:24 -0800 Subject: [PATCH 122/158] CLN: fix wrong types getting passed to TDI._get_string_slice (#30874) --- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/period.py | 3 +- pandas/core/indexes/timedeltas.py | 53 ++++++++----------------------- 3 files changed, 16 insertions(+), 42 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 404c65ea0f1d5..c158bdfbac441 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4902,7 +4902,7 @@ def isin(self, values, level=None): self._validate_index_level(level) return algos.isin(self, values) - def _get_string_slice(self, key, use_lhs=True, use_rhs=True): + def _get_string_slice(self, key: str_t, use_lhs: bool = True, use_rhs: bool = True): # this is for partial string indexing, # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex raise NotImplementedError diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 
d9d3e934af9f5..9d501b2601c09 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -684,7 +684,8 @@ def _parsed_string_to_bounds(self, reso, parsed): raise KeyError(reso) return (t1.asfreq(self.freq, how="start"), t1.asfreq(self.freq, how="end")) - def _get_string_slice(self, key): + def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): + # TODO: Check for non-True use_lhs/use_rhs if not self.is_monotonic: raise ValueError("Partial indexing only valid for ordered time series") diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 007714a1c19a2..1dd5c065ec216 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -237,25 +237,18 @@ def get_value(self, series, key): know what you're doing """ - if _is_convertible_to_td(key): + if isinstance(key, str): + try: + key = Timedelta(key) + except ValueError: + raise KeyError(key) + + if isinstance(key, self._data._recognized_scalars) or key is NaT: key = Timedelta(key) return self.get_value_maybe_box(series, key) - try: - value = Index.get_value(self, series, key) - except KeyError: - try: - loc = self._get_string_slice(key) - return series[loc] - except (TypeError, ValueError, KeyError): - pass - - try: - return self.get_value_maybe_box(series, key) - except (TypeError, ValueError, KeyError): - raise KeyError(key) - else: - return com.maybe_box(self, value, series, key) + value = Index.get_value(self, series, key) + return com.maybe_box(self, value, series, key) def get_value_maybe_box(self, series, key: Timedelta): loc = self.get_loc(key) @@ -288,19 +281,7 @@ def get_loc(self, key, method=None, tolerance=None): key = Timedelta(key) return Index.get_loc(self, key, method, tolerance) - try: - return Index.get_loc(self, key, method, tolerance) - except (KeyError, ValueError, TypeError): - try: - return self._get_string_slice(key) - except (TypeError, KeyError, ValueError): - pass - - try: - stamp = Timedelta(key) - return Index.get_loc(self, stamp, method, tolerance) - except (KeyError, ValueError): - raise KeyError(key) + return Index.get_loc(self, key, method, tolerance) def _maybe_cast_slice_bound(self, label, side, kind): """ @@ -330,18 +311,10 @@ def _maybe_cast_slice_bound(self, label, side, kind): return label - def _get_string_slice(self, key): - if is_integer(key) or is_float(key) or key is NaT: - self._invalid_indexer("slice", key) - loc = self._partial_td_slice(key) - return loc - - def _partial_td_slice(self, key): - + def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): + # TODO: Check for non-True use_lhs/use_rhs + assert isinstance(key, str), type(key) # given a key, try to figure out a location for a partial slice - if not isinstance(key, str): - return key - raise NotImplementedError def is_type_compatible(self, typ) -> bool: From 059742b9defce006c567e6b3f9b1b9205200f000 Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Mon, 20 Jan 2020 19:31:09 +0200 Subject: [PATCH 123/158] TYP: Type annotations in pandas/io/formats/style.py (#30403) --- pandas/io/formats/excel.py | 13 ++- pandas/io/formats/style.py | 215 +++++++++++++++++++++++-------------- 2 files changed, 142 insertions(+), 86 deletions(-) diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 9b0f100c1b041..b0e8e4033edf2 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -1,14 +1,17 @@ -"""Utilities for conversion to 
writer-agnostic Excel representation +""" +Utilities for conversion to writer-agnostic Excel representation. """ from functools import reduce import itertools import re -from typing import Callable, Dict, List, Optional, Sequence, Union +from typing import Callable, Dict, Optional, Sequence, Union import warnings import numpy as np +from pandas._typing import Label + from pandas.core.dtypes import missing from pandas.core.dtypes.common import is_float, is_scalar from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex @@ -371,10 +374,10 @@ def __init__( df, na_rep: str = "", float_format: Optional[str] = None, - cols: Optional[Sequence] = None, - header: Union[bool, List[str]] = True, + cols: Optional[Sequence[Label]] = None, + header: Union[Sequence[Label], bool] = True, index: bool = True, - index_label: Union[str, Sequence, None] = None, + index_label: Optional[Union[Label, Sequence[Label]]] = None, merge_cells: bool = False, inf_rep: str = "inf", style_converter: Optional[Callable] = None, diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 8570875569e44..4f2430b6c8568 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1,6 +1,5 @@ """ -Module for applying conditional formatting to -DataFrames and Series. +Module for applying conditional formatting to DataFrames and Series. """ from collections import defaultdict @@ -8,7 +7,17 @@ import copy from functools import partial from itertools import product -from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) from uuid import uuid1 import numpy as np @@ -16,6 +25,7 @@ from pandas._config import get_option from pandas._libs import lib +from pandas._typing import Axis, FrameOrSeries, FrameOrSeriesUnion, Label from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import Appender @@ -24,6 +34,7 @@ import pandas as pd from pandas.api.types import is_dict_like, is_list_like import pandas.core.common as com +from pandas.core.frame import DataFrame from pandas.core.generic import _shared_docs from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice @@ -41,7 +52,7 @@ @contextmanager -def _mpl(func): +def _mpl(func: Callable): if has_mpl: yield plt, colors else: @@ -125,13 +136,13 @@ class Styler: def __init__( self, - data, - precision=None, - table_styles=None, - uuid=None, - caption=None, - table_attributes=None, - cell_ids=True, + data: FrameOrSeriesUnion, + precision: Optional[int] = None, + table_styles: Optional[List[Dict[str, List[Tuple[str, str]]]]] = None, + uuid: Optional[str] = None, + caption: Optional[str] = None, + table_attributes: Optional[str] = None, + cell_ids: bool = True, na_rep: Optional[str] = None, ): self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) @@ -175,7 +186,7 @@ def default_display_func(x): Tuple[int, int], Callable[[Any], str] ] = defaultdict(lambda: default_display_func) - def _repr_html_(self): + def _repr_html_(self) -> str: """ Hooks into Jupyter notebook rich display system. 
""" @@ -196,22 +207,22 @@ def _repr_html_(self): def to_excel( self, excel_writer, - sheet_name="Sheet1", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - startrow=0, - startcol=0, - engine=None, - merge_cells=True, - encoding=None, - inf_rep="inf", - verbose=True, - freeze_panes=None, - ): + sheet_name: str = "Sheet1", + na_rep: str = "", + float_format: Optional[str] = None, + columns: Optional[Sequence[Label]] = None, + header: Union[Sequence[Label], bool] = True, + index: bool = True, + index_label: Optional[Union[Label, Sequence[Label]]] = None, + startrow: int = 0, + startcol: int = 0, + engine: Optional[str] = None, + merge_cells: bool = True, + encoding: Optional[str] = None, + inf_rep: str = "inf", + verbose: bool = True, + freeze_panes: Optional[Tuple[int, int]] = None, + ) -> None: from pandas.io.formats.excel import ExcelFormatter @@ -423,7 +434,7 @@ def format_attr(pair): table_attributes=table_attr, ) - def format(self, formatter, subset=None, na_rep: Optional[str] = None): + def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Styler": """ Format the text display value of cells. @@ -496,7 +507,7 @@ def format(self, formatter, subset=None, na_rep: Optional[str] = None): self._display_funcs[(i, j)] = formatter return self - def render(self, **kwargs): + def render(self, **kwargs) -> str: """ Render the built up styles to HTML. @@ -545,16 +556,18 @@ def render(self, **kwargs): d.update(kwargs) return self.template.render(**d) - def _update_ctx(self, attrs): + def _update_ctx(self, attrs: DataFrame) -> None: """ Update the state of the Styler. Collects a mapping of {index_label: [': ']}. - attrs : Series or DataFrame - should contain strings of ': ;: ' - Whitespace shouldn't matter and the final trailing ';' shouldn't - matter. + Parameters + ---------- + attrs : DataFrame + should contain strings of ': ;: ' + Whitespace shouldn't matter and the final trailing ';' shouldn't + matter. """ for row_label, v in attrs.iterrows(): for col_label, col in v.items(): @@ -563,7 +576,7 @@ def _update_ctx(self, attrs): for pair in col.rstrip(";").split(";"): self.ctx[(i, j)].append(pair) - def _copy(self, deepcopy=False): + def _copy(self, deepcopy: bool = False) -> "Styler": styler = Styler( self.data, precision=self.precision, @@ -580,16 +593,16 @@ def _copy(self, deepcopy=False): styler._todo = self._todo return styler - def __copy__(self): + def __copy__(self) -> "Styler": """ Deep copy by default. """ return self._copy(deepcopy=False) - def __deepcopy__(self, memo): + def __deepcopy__(self, memo) -> "Styler": return self._copy(deepcopy=True) - def clear(self): + def clear(self) -> None: """ Reset the styler, removing any previously applied styles. @@ -612,7 +625,13 @@ def _compute(self): r = func(self)(*args, **kwargs) return r - def _apply(self, func, axis=0, subset=None, **kwargs): + def _apply( + self, + func: Callable[..., "Styler"], + axis: Optional[Axis] = 0, + subset=None, + **kwargs, + ) -> "Styler": subset = slice(None) if subset is None else subset subset = _non_reducing_slice(subset) data = self.data.loc[subset] @@ -645,7 +664,13 @@ def _apply(self, func, axis=0, subset=None, **kwargs): self._update_ctx(result) return self - def apply(self, func, axis=0, subset=None, **kwargs): + def apply( + self, + func: Callable[..., "Styler"], + axis: Optional[Axis] = 0, + subset=None, + **kwargs, + ) -> "Styler": """ Apply a function column-wise, row-wise, or table-wise. 
@@ -696,7 +721,7 @@ def apply(self, func, axis=0, subset=None, **kwargs): ) return self - def _applymap(self, func, subset=None, **kwargs): + def _applymap(self, func: Callable, subset=None, **kwargs) -> "Styler": func = partial(func, **kwargs) # applymap doesn't take kwargs? if subset is None: subset = pd.IndexSlice[:] @@ -705,7 +730,7 @@ def _applymap(self, func, subset=None, **kwargs): self._update_ctx(result) return self - def applymap(self, func, subset=None, **kwargs): + def applymap(self, func: Callable, subset=None, **kwargs) -> "Styler": """ Apply a function elementwise. @@ -734,7 +759,14 @@ def applymap(self, func, subset=None, **kwargs): ) return self - def where(self, cond, value, other=None, subset=None, **kwargs): + def where( + self, + cond: Callable, + value: str, + other: Optional[str] = None, + subset=None, + **kwargs, + ) -> "Styler": """ Apply a function elementwise. @@ -773,7 +805,7 @@ def where(self, cond, value, other=None, subset=None, **kwargs): lambda val: value if cond(val) else other, subset=subset, **kwargs ) - def set_precision(self, precision): + def set_precision(self, precision: int) -> "Styler": """ Set the precision used to render. @@ -788,7 +820,7 @@ def set_precision(self, precision): self.precision = precision return self - def set_table_attributes(self, attributes): + def set_table_attributes(self, attributes: str) -> "Styler": """ Set the table attributes. @@ -812,7 +844,7 @@ def set_table_attributes(self, attributes): self.table_attributes = attributes return self - def export(self): + def export(self) -> List[Tuple[Callable, Tuple, Dict]]: """ Export the styles applied to the current Styler. @@ -828,7 +860,7 @@ def export(self): """ return self._todo - def use(self, styles): + def use(self, styles: List[Tuple[Callable, Tuple, Dict]]) -> "Styler": """ Set the styles on the current Styler. @@ -850,7 +882,7 @@ def use(self, styles): self._todo.extend(styles) return self - def set_uuid(self, uuid): + def set_uuid(self, uuid: str) -> "Styler": """ Set the uuid for a Styler. @@ -865,7 +897,7 @@ def set_uuid(self, uuid): self.uuid = uuid return self - def set_caption(self, caption): + def set_caption(self, caption: str) -> "Styler": """ Set the caption on a Styler. @@ -880,7 +912,7 @@ def set_caption(self, caption): self.caption = caption return self - def set_table_styles(self, table_styles): + def set_table_styles(self, table_styles) -> "Styler": """ Set the table styles on a Styler. @@ -927,7 +959,7 @@ def set_na_rep(self, na_rep: str) -> "Styler": self.na_rep = na_rep return self - def hide_index(self): + def hide_index(self) -> "Styler": """ Hide any indices from rendering. @@ -940,7 +972,7 @@ def hide_index(self): self.hidden_index = True return self - def hide_columns(self, subset): + def hide_columns(self, subset) -> "Styler": """ Hide columns from rendering. @@ -966,10 +998,10 @@ def hide_columns(self, subset): # ----------------------------------------------------------------------- @staticmethod - def _highlight_null(v, null_color): + def _highlight_null(v, null_color: str) -> str: return f"background-color: {null_color}" if pd.isna(v) else "" - def highlight_null(self, null_color="red"): + def highlight_null(self, null_color: str = "red") -> "Styler": """ Shade the background ``null_color`` for missing values.
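# Illustrative usage sketch, not part of the patch: highlight_null (annotated
# above) shades missing values and returns the Styler, so calls can be
# chained; the data and caption below are invented for demonstration.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 5.0, 6.0]})
styler = df.style.highlight_null(null_color="red").set_caption("missing values")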
@@ -987,14 +1019,14 @@ def highlight_null(self, null_color="red"): def background_gradient( self, cmap="PuBu", - low=0, - high=0, - axis=0, + low: float = 0, + high: float = 0, + axis: Optional[Axis] = 0, subset=None, - text_color_threshold=0.408, + text_color_threshold: float = 0.408, vmin: Optional[float] = None, vmax: Optional[float] = None, - ): + ) -> "Styler": """ Color the background in a gradient style. @@ -1069,9 +1101,9 @@ def background_gradient( def _background_gradient( s, cmap="PuBu", - low=0, - high=0, - text_color_threshold=0.408, + low: float = 0, + high: float = 0, + text_color_threshold: float = 0.408, vmin: Optional[float] = None, vmax: Optional[float] = None, ): @@ -1095,7 +1127,7 @@ def _background_gradient( # https://github.com/matplotlib/matplotlib/issues/5427 rgbas = plt.cm.get_cmap(cmap)(norm(s.to_numpy(dtype=float))) - def relative_luminance(rgba): + def relative_luminance(rgba) -> float: """ Calculate relative luminance of a color. @@ -1117,7 +1149,7 @@ def relative_luminance(rgba): ) return 0.2126 * r + 0.7152 * g + 0.0722 * b - def css(rgba): + def css(rgba) -> str: dark = relative_luminance(rgba) < text_color_threshold text_color = "#f1f1f1" if dark else "#000000" return f"background-color: {colors.rgb2hex(rgba)};color: {text_color};" @@ -1131,7 +1163,7 @@ def css(rgba): columns=s.columns, ) - def set_properties(self, subset=None, **kwargs): + def set_properties(self, subset=None, **kwargs) -> "Styler": """ Method to set one or more non-data dependent properties for each cell. @@ -1157,7 +1189,7 @@ def set_properties(self, subset=None, **kwargs): return self.applymap(f, subset=subset) @staticmethod - def _bar(s, align, colors, width=100, vmin=None, vmax=None): + def _bar( + s, + align: str, + colors: List[str], + width: float = 100, + vmin: Optional[float] = None, + vmax: Optional[float] = None, + ): """ Draw bar chart in dataframe cells. """ @@ -1175,7 +1214,7 @@ def _bar(s, align, colors, width=100, vmin=None, vmax=None): normed = width * (s.to_numpy(dtype=float) - smin) / (smax - smin + 1e-12) zero = -width * smin / (smax - smin + 1e-12) - def css_bar(start, end, color): + def css_bar(start: float, end: float, color: str) -> str: """ Generate CSS code to draw a bar from start to end. """ @@ -1212,13 +1251,13 @@ def css(x): def bar( self, subset=None, - axis=0, + axis: Optional[Axis] = 0, color="#d65f5f", - width=100, - align="left", - vmin=None, - vmax=None, - ): + width: float = 100, + align: str = "left", + vmin: Optional[float] = None, + vmax: Optional[float] = None, + ) -> "Styler": """ Draw bar chart in the cell backgrounds. @@ -1293,7 +1332,9 @@ def bar( return self - def highlight_max(self, subset=None, color="yellow", axis=0): + def highlight_max( + self, subset=None, color: str = "yellow", axis: Optional[Axis] = 0 + ) -> "Styler": """ Highlight the maximum by shading the background. @@ -1313,7 +1354,9 @@ def highlight_max(self, subset=None, color="yellow", axis=0): """ return self._highlight_handler(subset=subset, color=color, axis=axis, max_=True) - def highlight_min(self, subset=None, color="yellow", axis=0): + def highlight_min( + self, subset=None, color: str = "yellow", axis: Optional[Axis] = 0 + ) -> "Styler": """ Highlight the minimum by shading the background.
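# Illustrative usage sketch, not part of the patch: the extrema helpers
# annotated above shade the per-column maximum/minimum when axis=0; the data
# and colors below are invented for demonstration.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [9, 8, 7]})
styler = df.style.highlight_max(color="yellow", axis=0).highlight_min(
    color="lightblue", axis=0
)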
@@ -1335,7 +1378,13 @@ def highlight_min(self, subset=None, color="yellow", axis=0): subset=subset, color=color, axis=axis, max_=False ) - def _highlight_handler(self, subset=None, color="yellow", axis=None, max_=True): + def _highlight_handler( + self, + subset=None, + color: str = "yellow", + axis: Optional[Axis] = None, + max_: bool = True, + ) -> "Styler": subset = _non_reducing_slice(_maybe_numeric_slice(self.data, subset)) self.apply( self._highlight_extrema, color=color, axis=axis, subset=subset, max_=max_ @@ -1343,7 +1392,9 @@ def _highlight_handler(self, subset=None, color="yellow", axis=None, max_=True): return self @staticmethod - def _highlight_extrema(data, color="yellow", max_=True): + def _highlight_extrema( + data: FrameOrSeries, color: str = "yellow", max_: bool = True + ): """ Highlight the min or max in a Series or DataFrame. """ @@ -1388,7 +1439,7 @@ class MyStyler(cls): return MyStyler - def pipe(self, func, *args, **kwargs): + def pipe(self, func: Callable, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``, and return the result. @@ -1460,7 +1511,7 @@ def pipe(self, func, *args, **kwargs): return com.pipe(self, func, *args, **kwargs) -def _is_visible(idx_row, idx_col, lengths): +def _is_visible(idx_row, idx_col, lengths) -> bool: """ Index -> {(idx_row, idx_col): bool}). """ @@ -1510,7 +1561,9 @@ def _get_level_lengths(index, hidden_elements=None): return non_zero_lengths -def _maybe_wrap_formatter(formatter, na_rep: Optional[str]): +def _maybe_wrap_formatter( + formatter: Union[Callable, str], na_rep: Optional[str] +) -> Callable: if isinstance(formatter, str): formatter_func = lambda x: formatter.format(x) elif callable(formatter): From 3fbc33205ce4f700e2934d524a0f080af8548f19 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Mon, 20 Jan 2020 17:38:27 +0000 Subject: [PATCH 124/158] CLN: Remove unused release scripts (#31049) --- scripts/build_dist.sh | 18 ------------------ scripts/build_dist_for_release.sh | 10 ---------- 2 files changed, 28 deletions(-) delete mode 100755 scripts/build_dist.sh delete mode 100755 scripts/build_dist_for_release.sh diff --git a/scripts/build_dist.sh b/scripts/build_dist.sh deleted file mode 100755 index c3f849ce7a6eb..0000000000000 --- a/scripts/build_dist.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# build the distribution -LAST=`git tag --sort version:refname | grep -v rc | tail -1` - -echo "Building distribution for: $LAST" -git checkout $LAST - -read -p "Ok to continue (y/n)? 
" answer -case ${answer:0:1} in - y|Y ) - echo "Building distribution" - ./build_dist_for_release.sh - ;; - * ) - echo "Not building distribution" - ;; -esac diff --git a/scripts/build_dist_for_release.sh b/scripts/build_dist_for_release.sh deleted file mode 100755 index bee0f23a68ec2..0000000000000 --- a/scripts/build_dist_for_release.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# this requires cython to be installed - -# this builds the release cleanly & is building on the current checkout -rm -rf dist -git clean -xfd -python setup.py clean --quiet -python setup.py cython --quiet -python setup.py sdist --formats=gztar --quiet From 7882aac45f92a0313e09e84dac0ac3f7d17ed899 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 20 Jan 2020 18:04:00 +0000 Subject: [PATCH 125/158] :pencil: set klass correctly for series and dataframe set_axis (#30885) --- pandas/core/frame.py | 40 +++++++++++++++++++++++++++++++ pandas/core/generic.py | 54 ++++-------------------------------------- pandas/core/series.py | 26 ++++++++++++++++++++ 3 files changed, 71 insertions(+), 49 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1a49388d81243..d7732f5f43492 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3824,6 +3824,46 @@ def align( broadcast_axis=broadcast_axis, ) + @Appender( + """ + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + Change the row labels. + + >>> df.set_axis(['a', 'b', 'c'], axis='index') + A B + a 1 4 + b 2 5 + c 3 6 + + Change the column labels. + + >>> df.set_axis(['I', 'II'], axis='columns') + I II + 0 1 4 + 1 2 5 + 2 3 6 + + Now, update the labels inplace. + + >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) + >>> df + i ii + 0 1 4 + 1 2 5 + 2 3 6 + """ + ) + @Substitution( + **_shared_doc_kwargs, + extended_summary_sub=" column or", + axis_description_sub=", and 1 identifies the columns", + see_also_sub=" or columns", + ) + @Appender(NDFrame.set_axis.__doc__) + def set_axis(self, labels, axis=0, inplace=False): + return super().set_axis(labels, axis=axis, inplace=inplace) + @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.reindex.__doc__) @rewrite_axis_style_signature( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0c5c119468994..7b216c53c68cf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -559,7 +559,7 @@ def set_axis(self, labels, axis=0, inplace=False): """ Assign desired index to given axis. - Indexes for column or row labels can be changed by assigning + Indexes for%(extended_summary_sub)s row labels can be changed by assigning a list-like or Index. .. versionchanged:: 0.21.0 @@ -574,9 +574,8 @@ def set_axis(self, labels, axis=0, inplace=False): labels : list-like, Index The values for the new index. - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to update. The value 0 identifies the rows, and 1 - identifies the columns. + axis : %(axes_single_arg)s, default 0 + The axis to update. The value 0 identifies the rows%(axis_description_sub)s. inplace : bool, default False Whether to return a new %(klass)s instance. @@ -584,57 +583,14 @@ def set_axis(self, labels, axis=0, inplace=False): Returns ------- renamed : %(klass)s or None - An object of same type as caller if inplace=False, None otherwise. + An object of type %(klass)s if inplace=False, None otherwise. See Also -------- - DataFrame.rename_axis : Alter the name of the index or columns. 
+ %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. Examples -------- - **Series** - - >>> s = pd.Series([1, 2, 3]) - >>> s - 0 1 - 1 2 - 2 3 - dtype: int64 - - >>> s.set_axis(['a', 'b', 'c'], axis=0) - a 1 - b 2 - c 3 - dtype: int64 - - **DataFrame** - - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - - Change the row labels. - - >>> df.set_axis(['a', 'b', 'c'], axis='index') - A B - a 1 4 - b 2 5 - c 3 6 - - Change the column labels. - - >>> df.set_axis(['I', 'II'], axis='columns') - I II - 0 1 4 - 1 2 5 - 2 3 6 - - Now, update the labels inplace. - - >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) - >>> df - i ii - 0 1 4 - 1 2 5 - 2 3 6 """ if inplace: setattr(self, self._get_axis_name(axis), labels) diff --git a/pandas/core/series.py b/pandas/core/series.py index 580e3745136d7..ffe0642f799fa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3948,6 +3948,32 @@ def rename( else: return self._set_name(index, inplace=inplace) + @Appender( + """ + >>> s = pd.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + + >>> s.set_axis(['a', 'b', 'c'], axis=0) + a 1 + b 2 + c 3 + dtype: int64 + """ + ) + @Substitution( + **_shared_doc_kwargs, + extended_summary_sub="", + axis_description_sub="", + see_also_sub="", + ) + @Appender(generic.NDFrame.set_axis.__doc__) + def set_axis(self, labels, axis=0, inplace=False): + return super().set_axis(labels, axis=axis, inplace=inplace) + @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.reindex.__doc__) def reindex(self, index=None, **kwargs): From 15bacea86844237a9e5290446612ebe3ea712d84 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 20 Jan 2020 18:11:01 +0000 Subject: [PATCH 126/158] raise more specific error if dict is appended to frame wit… (#30882) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.1.0.rst | 3 ++- pandas/core/frame.py | 2 ++ pandas/tests/frame/methods/test_append.py | 4 ++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 40c02eb495f67..294183e24c96f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -159,7 +159,8 @@ ExtensionArray Other ^^^^^ -- +- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` + instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) - ..
--------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d7732f5f43492..fa9a951d6849c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7102,6 +7102,8 @@ def append( """ if isinstance(other, (Series, dict)): if isinstance(other, dict): + if not ignore_index: + raise TypeError("Can only append a dict if ignore_index=True") other = Series(other) if other.name is None and not ignore_index: raise TypeError( diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index d128a51f4b390..9fc3629e794e2 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -50,6 +50,10 @@ def test_append_series_dict(self): ) tm.assert_frame_equal(result, expected.loc[:, result.columns]) + msg = "Can only append a dict if ignore_index=True" + with pytest.raises(TypeError, match=msg): + df.append(series.to_dict()) + # can append when name set row = df.loc[4] row.name = 5 From e0bd3945ad5261bf32527729cfebd92355a684fe Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 21 Jan 2020 06:52:08 +1100 Subject: [PATCH 127/158] ENH Avoid redundant CSS in Styler.render (#30876) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/formats/style.py | 13 +++++++++---- pandas/io/formats/templates/html.tpl | 2 +- pandas/tests/io/formats/test_style.py | 15 +++++++++++++-- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 294183e24c96f..01c089b46b4a1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -18,6 +18,7 @@ Enhancements Other enhancements ^^^^^^^^^^^^^^^^^^ +- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - - diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 4f2430b6c8568..565752e269d79 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -286,7 +286,7 @@ def format_attr(pair): clabels = [[x] for x in clabels] clabels = list(zip(*clabels)) - cellstyle = [] + cellstyle_map = defaultdict(list) head = [] for r in range(n_clvls): @@ -408,12 +408,17 @@ def format_attr(pair): for x in ctx[r, c]: # have to handle empty styles like [''] if x.count(":"): - props.append(x.split(":")) + props.append(tuple(x.split(":"))) else: - props.append(["", ""]) - cellstyle.append({"props": props, "selector": f"row{r}_col{c}"}) + props.append(("", "")) + cellstyle_map[tuple(props)].append(f"row{r}_col{c}") body.append(row_es) + cellstyle = [ + {"props": list(props), "selectors": selectors} + for props, selectors in cellstyle_map.items() + ] + table_attr = self.table_attributes use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: diff --git a/pandas/io/formats/templates/html.tpl b/pandas/io/formats/templates/html.tpl index 15feafcea6864..97bfda9af089d 100644 --- a/pandas/io/formats/templates/html.tpl +++ b/pandas/io/formats/templates/html.tpl @@ -14,7 +14,7 @@ {% block before_cellstyle %}{% endblock before_cellstyle %} {% block cellstyle %} {%- for s in cellstyle %} - #T_{{uuid}}{{s.selector}} { + {%- for selector in s.selectors -%}{%- if not loop.first -%},{%- endif -%}#T_{{uuid}}{{selector}}{%- endfor -%} { {% for p,val in s.props %} {{p}}: {{val}}; {% endfor %} diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index e5dac18acedf6..a2659079be7c0 100644 --- 
a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -472,8 +472,19 @@ def test_empty(self): result = s._translate()["cellstyle"] expected = [ - {"props": [["color", " red"]], "selector": "row0_col0"}, - {"props": [["", ""]], "selector": "row1_col0"}, + {"props": [("color", " red")], "selectors": ["row0_col0"]}, + {"props": [("", "")], "selectors": ["row1_col0"]}, + ] + assert result == expected + + def test_duplicate(self): + df = pd.DataFrame({"A": [1, 0]}) + s = df.style + s.ctx = {(0, 0): ["color: red"], (1, 0): ["color: red"]} + + result = s._translate()["cellstyle"] + expected = [ + {"props": [("color", " red")], "selectors": ["row0_col0", "row1_col0"]} ] assert result == expected From cd20c954542c56104aefb7898e6084e7e92c0a2b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 20 Jan 2020 11:53:19 -0800 Subject: [PATCH 128/158] REF: use _get_string_slice in PeriodIndex.get_value (#31058) --- pandas/core/indexes/period.py | 85 ++++++++++++----------------------- 1 file changed, 29 insertions(+), 56 deletions(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 9d501b2601c09..b3386f6104032 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -483,25 +483,20 @@ def get_value(self, series, key): return series.iat[key] if isinstance(key, str): + try: + loc = self._get_string_slice(key) + return series[loc] + except (TypeError, ValueError): + pass + asdt, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - vals = self._ndarray_values - - # if our data is higher resolution than requested key, slice - if grp < freqn: - iv = Period(asdt, freq=(grp, 1)) - ord1 = iv.asfreq(self.freq, how="S").ordinal - ord2 = iv.asfreq(self.freq, how="E").ordinal + # _get_string_slice will handle cases where grp < freqn + assert grp >= freqn - if ord2 < vals[0] or ord1 > vals[-1]: - raise KeyError(key) - - pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) - key = slice(pos[0], pos[1] + 1) - return series[key] - elif grp == freqn: + if grp == freqn: key = Period(asdt, freq=self.freq) loc = self.get_loc(key) return series.iloc[loc] @@ -643,61 +638,39 @@ def _maybe_cast_slice_bound(self, label, side, kind): return label - def _parsed_string_to_bounds(self, reso, parsed): - if reso == "year": - t1 = Period(year=parsed.year, freq="A") - elif reso == "month": - t1 = Period(year=parsed.year, month=parsed.month, freq="M") - elif reso == "quarter": - q = (parsed.month - 1) // 3 + 1 - t1 = Period(year=parsed.year, quarter=q, freq="Q-DEC") - elif reso == "day": - t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, freq="D") - elif reso == "hour": - t1 = Period( - year=parsed.year, - month=parsed.month, - day=parsed.day, - hour=parsed.hour, - freq="H", - ) - elif reso == "minute": - t1 = Period( - year=parsed.year, - month=parsed.month, - day=parsed.day, - hour=parsed.hour, - minute=parsed.minute, - freq="T", - ) - elif reso == "second": - t1 = Period( - year=parsed.year, - month=parsed.month, - day=parsed.day, - hour=parsed.hour, - minute=parsed.minute, - second=parsed.second, - freq="S", - ) - else: + def _parsed_string_to_bounds(self, reso: str, parsed: datetime): + if reso not in ["year", "month", "quarter", "day", "hour", "minute", "second"]: raise KeyError(reso) - return (t1.asfreq(self.freq, how="start"), t1.asfreq(self.freq, how="end")) + + grp = resolution.Resolution.get_freq_group(reso) + iv = Period(parsed, 
freq=(grp, 1)) + return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end")) def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): # TODO: Check for non-True use_lhs/use_rhs + raw = key if not self.is_monotonic: raise ValueError("Partial indexing only valid for ordered time series") parsed, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - if reso in ["day", "hour", "minute", "second"] and not grp < freqn: - raise KeyError(key) + + if not grp < freqn: + # TODO: we used to also check for + # reso in ["day", "hour", "minute", "second"] + # why is that check not needed? + raise TypeError(key) t1, t2 = self._parsed_string_to_bounds(reso, parsed) + if len(self): + if t2 < self.min() or t1 > self.max(): + raise KeyError(raw) + + # Use asi8 searchsorted to avoid overhead of re-validating inputs return slice( - self.searchsorted(t1, side="left"), self.searchsorted(t2, side="right") + self.asi8.searchsorted(t1.ordinal, side="left"), + self.asi8.searchsorted(t2.ordinal, side="right"), ) def _convert_tolerance(self, tolerance, target): From f1aaf62f181e19d716bc0ade018a8081c1c02d83 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 20 Jan 2020 21:08:11 +0100 Subject: [PATCH 129/158] BUG: df.pivot_table fails when margin is True and only columns is defined (#31088) --- asv_bench/benchmarks/reshape.py | 3 ++ doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/reshape/pivot.py | 36 ++++++++----------- pandas/tests/reshape/test_pivot.py | 58 ++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 21 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 441f4b380656e..21081ee23a773 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -161,6 +161,9 @@ def time_pivot_table_categorical_observed(self): observed=True, ) + def time_pivot_table_margins_only_column(self): + self.df.pivot_table(columns=["key2", "key3"], margins=True) + class Crosstab: def setup(self): diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 01c089b46b4a1..b7adf7bf0d80d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -141,6 +141,7 @@ Reshaping - - Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`) +- Bug in :meth:`DataFrame.pivot_table` when ``margins`` is ``True`` and only ``columns`` is defined (:issue:`31016`) - Fix incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns.
(:issue:`18321`) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 930ff5f454a7b..e250a072766e3 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -226,15 +226,7 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, - data, - values, - rows, - cols, - aggfunc, - observed, - grand_margin, - margins_name, + table, data, values, rows, cols, aggfunc, observed, margins_name, ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -303,15 +295,7 @@ def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): def _generate_marginal_results( - table, - data, - values, - rows, - cols, - aggfunc, - observed, - grand_margin, - margins_name: str = "All", + table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All", ): if len(cols) > 0: # need to "interleave" the margins @@ -345,12 +329,22 @@ def _all_key(key): table_pieces.append(piece) margin_keys.append(all_key) else: - margin = grand_margin + from pandas import DataFrame + cat_axis = 0 for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): - all_key = _all_key(key) + if len(cols) > 1: + all_key = _all_key(key) + else: + all_key = margins_name table_pieces.append(piece) - table_pieces.append(Series(margin[key], index=[all_key])) + # GH31016 this is to calculate margin for each group, and assign + # corresponded key as index + transformed_piece = DataFrame(piece.apply(aggfunc)).T + transformed_piece.index = Index([all_key], name=piece.index.name) + + # append piece for margin into table_piece + table_pieces.append(transformed_piece) margin_keys.append(all_key) result = concat(table_pieces, axis=cat_axis) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 44073f56abfa1..6850c52ca05ea 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -910,6 +910,64 @@ def _check_output( totals = table.loc[("All", ""), item] assert totals == self.data[item].mean() + @pytest.mark.parametrize( + "columns, aggfunc, values, expected_columns", + [ + ( + "A", + np.mean, + [[5.5, 5.5, 2.2, 2.2], [8.0, 8.0, 4.4, 4.4]], + Index(["bar", "All", "foo", "All"], name="A"), + ), + ( + ["A", "B"], + "sum", + [[9, 13, 22, 5, 6, 11], [14, 18, 32, 11, 11, 22]], + MultiIndex.from_tuples( + [ + ("bar", "one"), + ("bar", "two"), + ("bar", "All"), + ("foo", "one"), + ("foo", "two"), + ("foo", "All"), + ], + names=["A", "B"], + ), + ), + ], + ) + def test_margin_with_only_columns_defined( + self, columns, aggfunc, values, expected_columns + ): + # GH 31016 + df = pd.DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + + result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) + expected = pd.DataFrame( + values, index=Index(["D", "E"]), columns=expected_columns + ) + + tm.assert_frame_equal(result, expected) + def test_margins_dtype(self): # GH 17013 From 29edd119d31a9ee7d4f89e8c1dc8af96f0c19dce Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 Jan 2020 16:25:32 -0600 Subject: [PATCH 130/158] BUG: Break reference from grouping level to MI (#31133) --- pandas/core/indexes/multi.py | 4 ++++ 
pandas/tests/groupby/test_apply.py | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8682af6ab6369..b684908c25fe5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1256,6 +1256,10 @@ def _get_grouper_for_level(self, mapper, level): if len(uniques) < len(level_index): # Remove unobserved levels from level_index level_index = level_index.take(uniques) + else: + # break references back to us so that setting the name + # on the output of a groupby doesn't reflect back here. + level_index = level_index.copy() if len(level_index): grouper = level_index.take(codes) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e81ff37510dc0..708d3429285a8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -775,3 +775,20 @@ def most_common_values(df): ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("category", [False, True]) +def test_apply_multi_level_name(category): + # https://github.com/pandas-dev/pandas/issues/31068 + b = [1, 2] * 5 + if category: + b = pd.Categorical(b, categories=[1, 2, 3]) + df = pd.DataFrame( + {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} + ).set_index(["A", "B"]) + result = df.groupby("B").apply(lambda x: x.sum()) + expected = pd.DataFrame( + {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B") + ) + tm.assert_frame_equal(result, expected) + assert df.index.names == ["A", "B"] From 469b4b71fd2eaf9c04b40e270ceec2f1dd7961ce Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Tue, 21 Jan 2020 00:28:00 +0200 Subject: [PATCH 131/158] CLN/STY: various code cleanups (#31162) --- pandas/core/computation/align.py | 3 +- pandas/core/computation/eval.py | 45 +++++++++++-------------- pandas/core/computation/expressions.py | 21 +++++------- pandas/core/frame.py | 33 ++++-------------- pandas/tests/frame/methods/test_diff.py | 2 +- pandas/tests/test_take.py | 4 +-- 6 files changed, 39 insertions(+), 69 deletions(-) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index a1b1cffdd1d76..e45d3ca66b6ec 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -1,4 +1,5 @@ -"""Core eval alignment algorithms +""" +Core eval alignment algorithms. """ from functools import partial, wraps diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 71e1b6c2a08a9..4cdf4bac61316 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Top level ``eval`` module. """ @@ -26,30 +25,29 @@ def _check_engine(engine: Optional[str]) -> str: Parameters ---------- engine : str + String to validate. Raises ------ KeyError - * If an invalid engine is passed + * If an invalid engine is passed. ImportError - * If numexpr was requested but doesn't exist + * If numexpr was requested but doesn't exist. Returns ------- - string engine + str + Engine name. 
""" from pandas.core.computation.check import _NUMEXPR_INSTALLED if engine is None: - if _NUMEXPR_INSTALLED: - engine = "numexpr" - else: - engine = "python" + engine = "numexpr" if _NUMEXPR_INSTALLED else "python" if engine not in _engines: - valid = list(_engines.keys()) + valid_engines = list(_engines.keys()) raise KeyError( - f"Invalid engine {repr(engine)} passed, valid engines are {valid}" + f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" ) # TODO: validate this in a more general way (thinking of future engines @@ -58,10 +56,8 @@ def _check_engine(engine: Optional[str]) -> str: if engine == "numexpr": if not _NUMEXPR_INSTALLED: raise ImportError( - "'numexpr' is not installed or an " - "unsupported version. Cannot use " - "engine='numexpr' for query/eval " - "if 'numexpr' is not installed" + "'numexpr' is not installed or an unsupported version. Cannot use " + "engine='numexpr' for query/eval if 'numexpr' is not installed" ) return engine @@ -80,11 +76,9 @@ def _check_parser(parser: str): KeyError * If an invalid parser is passed """ - if parser not in _parsers: raise KeyError( - f"Invalid parser {repr(parser)} passed, " - f"valid parsers are {_parsers.keys()}" + f"Invalid parser '{parser}' passed, valid parsers are {_parsers.keys()}" ) @@ -94,8 +88,8 @@ def _check_resolvers(resolvers): if not hasattr(resolver, "__getitem__"): name = type(resolver).__name__ raise TypeError( - f"Resolver of type {repr(name)} does not " - f"implement the __getitem__ method" + f"Resolver of type '{name}' does not " + "implement the __getitem__ method" ) @@ -155,10 +149,8 @@ def _check_for_locals(expr: str, stack_level: int, parser: str): msg = "The '@' prefix is only supported by the pandas parser" elif at_top_of_stack: msg = ( - "The '@' prefix is not allowed in " - "top-level eval calls, \nplease refer to " - "your variables by name without the '@' " - "prefix" + "The '@' prefix is not allowed in top-level eval calls.\n" + "please refer to your variables by name without the '@' prefix." ) if at_top_of_stack or not_pandas_parser: @@ -285,13 +277,14 @@ def eval( See the :ref:`enhancing performance ` documentation for more details. """ - inplace = validate_bool_kwarg(inplace, "inplace") if truediv is not no_default: warnings.warn( - "The `truediv` parameter in pd.eval is deprecated and will be " - "removed in a future version.", + ( + "The `truediv` parameter in pd.eval is deprecated and " + "will be removed in a future version." + ), FutureWarning, stacklevel=2, ) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 7e959889ee997..ada983e9e4fad 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -45,12 +45,9 @@ def set_use_numexpr(v=True): # choose what we are going to do global _evaluate, _where - if not _USE_NUMEXPR: - _evaluate = _evaluate_standard - _where = _where_standard - else: - _evaluate = _evaluate_numexpr - _where = _where_numexpr + + _evaluate = _evaluate_numexpr if _USE_NUMEXPR else _evaluate_standard + _where = _where_numexpr if _USE_NUMEXPR else _where_standard def set_numexpr_threads(n=None): @@ -63,7 +60,9 @@ def set_numexpr_threads(n=None): def _evaluate_standard(op, op_str, a, b): - """ standard evaluation """ + """ + Standard evaluation. 
+ """ if _TEST_MODE: _store_test_result(False) with np.errstate(all="ignore"): @@ -176,7 +175,7 @@ def _bool_arith_check( if op_str in unsupported: warnings.warn( f"evaluating in Python space because the {repr(op_str)} " - f"operator is not supported by numexpr for " + "operator is not supported by numexpr for " f"the bool dtype, use {repr(unsupported[op_str])} instead" ) return False @@ -202,7 +201,6 @@ def evaluate(op, op_str, a, b, use_numexpr=True): use_numexpr : bool, default True Whether to try to use numexpr. """ - use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: return _evaluate(op, op_str, a, b) @@ -221,10 +219,7 @@ def where(cond, a, b, use_numexpr=True): use_numexpr : bool, default True Whether to try to use numexpr. """ - - if use_numexpr: - return _where(cond, a, b) - return _where_standard(cond, a, b) + return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) def set_test_mode(v=True): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fa9a951d6849c..4257083cc8dc5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8,6 +8,7 @@ alignment and a host of useful data manipulation methods having to do with the labeling information """ + import collections from collections import abc from io import StringIO @@ -258,7 +259,6 @@ Examples -------- - >>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], ... 'value': [1, 2, 3, 5]}) >>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], @@ -491,12 +491,12 @@ def __init__( else: try: arr = np.array(data, dtype=dtype, copy=copy) - except (ValueError, TypeError) as e: + except (ValueError, TypeError) as err: exc = TypeError( "DataFrame constructor called with " - f"incompatible data and dtype: {e}" + f"incompatible data and dtype: {err}" ) - raise exc from e + raise exc from err if arr.ndim == 0 and index is not None and columns is not None: values = cast_scalar_to_array( @@ -794,7 +794,6 @@ def to_string( 1 2 5 2 3 6 """ - from pandas import option_context with option_context("display.max_colwidth", max_colwidth): @@ -1583,7 +1582,6 @@ def from_records( ------- DataFrame """ - # Make a copy of the input columns so we can modify it if columns is not None: columns = ensure_index(columns) @@ -1764,7 +1762,6 @@ def to_records( rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], dtype=[('I', 'S1'), ('A', ' "DataFrame": 4 True 1.0 5 False 2.0 """ - if not is_list_like(include): include = (include,) if include is not None else () if not is_list_like(exclude): @@ -3685,11 +3677,7 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: Returns ------- numpy.ndarray - - Examples - -------- - values : ndarray - The found values + The found values. """ n = len(row_labels) if n != len(col_labels): @@ -3780,7 +3768,6 @@ def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": """ We are guaranteed non-Nones in the axes. 
""" - new_index, row_indexer = self.index.reindex(axes["index"]) new_columns, col_indexer = self.columns.reindex(axes["columns"]) @@ -4101,7 +4088,6 @@ def rename( Examples -------- - ``DataFrame.rename`` supports two calling conventions * ``(index=index_mapper, columns=columns_mapper, ...)`` @@ -5591,7 +5577,6 @@ def combine_first(self, other: "DataFrame") -> "DataFrame": Examples -------- - >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) >>> df1.combine_first(df2) @@ -6370,7 +6355,6 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame": 3 3 1 3 4 1 """ - if not (is_scalar(column) or isinstance(column, tuple)): raise ValueError("column must be a scalar") if not self.columns.is_unique: @@ -6855,7 +6839,6 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): Examples -------- - >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) >>> df A B @@ -7050,7 +7033,6 @@ def append( Examples -------- - >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) >>> df A B @@ -8432,7 +8414,6 @@ def isin(self, values) -> "DataFrame": Examples -------- - >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, ... index=['falcon', 'dog']) >>> df @@ -8493,7 +8474,7 @@ def isin(self, values) -> "DataFrame": raise TypeError( "only list-like or dict-like objects are allowed " "to be passed to DataFrame.isin(), " - f"you passed a {repr(type(values).__name__)}" + f"you passed a '{type(values).__name__}'" ) return DataFrame( algorithms.isin(self.values.ravel(), values).reshape(self.shape), diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 43c25f4c05c2d..ffdb6d41ebda5 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -15,7 +15,7 @@ def test_diff(self, datetime_frame): ) # int dtype - a = 10000000000000000 + a = 10_000_000_000_000_000 b = a + 1 s = Series([a, b]) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index 1d2ab9358c01c..2534f1849cf61 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -345,7 +345,7 @@ def test_2d_float32(self): def test_2d_datetime64(self): # 2005/01/01 - 2006/01/01 - arr = np.random.randint(11045376, 11360736, (5, 3)) * 100000000000 + arr = np.random.randint(11_045_376, 11_360_736, (5, 3)) * 100_000_000_000 arr = arr.view(dtype="datetime64[ns]") indexer = [0, 2, -1, 1, -1] @@ -452,7 +452,7 @@ def test_take_empty(self, allow_fill): tm.assert_numpy_array_equal(arr, result) msg = ( - r"cannot do a non-empty take from an empty axes.|" + "cannot do a non-empty take from an empty axes.|" "indices are out-of-bounds" ) with pytest.raises(IndexError, match=msg): From 24d7c06130f9c2aeebedc26971b244ce076f7d0a Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Tue, 21 Jan 2020 00:15:13 +0100 Subject: [PATCH 132/158] BUG: Fix MutliIndexed unstack failures at tuple names (#30943) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/reshape/reshape.py | 4 + pandas/tests/frame/test_reshape.py | 74 ++++++++++++++++ pandas/tests/series/test_analytics.py | 61 +------------ pandas/tests/series/test_reshaping.py | 120 ++++++++++++++++++++++++++ 5 files changed, 200 insertions(+), 60 deletions(-) create mode 100644 pandas/tests/series/test_reshaping.py diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b7adf7bf0d80d..e349745d51e83 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -141,6 +141,7 @@ 
Reshaping - - Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`) +- Bug in :meth:`DataFrame.unstack` and :meth:`Series.unstack` where tuple names in MultiIndexed data were not recognised (:issue:`19966`) - Bug in :meth:`DataFrame.pivot_table` when ``margins`` is ``True`` and only ``columns`` is defined (:issue:`31016`) - Fix incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 97f416e32d07b..fab9f41cb6c4f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -317,6 +317,10 @@ def _unstack_multiple(data, clocs, fill_value=None): index = data.index + # GH 19966 Make sure if MultiIndexed index has tuple names, they will be + # recognised as a whole + if clocs in index.names: + clocs = [clocs] clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 60b7611c8b9be..b3af5a7b7317e 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -336,6 +336,80 @@ def test_unstack_fill_frame_categorical(self): ) tm.assert_frame_equal(result, expected) + def test_unstack_tuplename_in_multiindex(self): + # GH 19966 + idx = pd.MultiIndex.from_product( + [["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")] + ) + df = pd.DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx) + result = df.unstack(("A", "a")) + + expected = pd.DataFrame( + [[1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2]], + columns=pd.MultiIndex.from_tuples( + [ + ("d", "a"), + ("d", "b"), + ("d", "c"), + ("e", "a"), + ("e", "b"), + ("e", "c"), + ], + names=[None, ("A", "a")], + ), + index=pd.Index([1, 2, 3], name=("B", "b")), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "unstack_idx, expected_values, expected_index, expected_columns", + [ + ( + ("A", "a"), + [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]], + pd.MultiIndex.from_tuples( + [(1, 3), (1, 4), (2, 3), (2, 4)], names=["B", "C"] + ), + pd.MultiIndex.from_tuples( + [("d", "a"), ("d", "b"), ("e", "a"), ("e", "b")], + names=[None, ("A", "a")], + ), + ), + ( + (("A", "a"), "B"), + [[1, 1, 1, 1, 2, 2, 2, 2], [1, 1, 1, 1, 2, 2, 2, 2]], + pd.Index([3, 4], name="C"), + pd.MultiIndex.from_tuples( + [ + ("d", "a", 1), + ("d", "a", 2), + ("d", "b", 1), + ("d", "b", 2), + ("e", "a", 1), + ("e", "a", 2), + ("e", "b", 1), + ("e", "b", 2), + ], + names=[None, ("A", "a"), "B"], + ), + ), + ], + ) + def test_unstack_mixed_type_name_in_multiindex( + self, unstack_idx, expected_values, expected_index, expected_columns + ): + # GH 19966 + idx = pd.MultiIndex.from_product( + [["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"] + ) + df = pd.DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx) + result = df.unstack(unstack_idx) + + expected = pd.DataFrame( + expected_values, columns=expected_columns, index=expected_index, + ) + tm.assert_frame_equal(result, expected) + def test_unstack_preserve_dtypes(self): # Checks fix for #11847 df = pd.DataFrame( diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index c29bd3ea0cb7d..e6e91b5d4f5f4 100644 ---
b/pandas/tests/series/test_analytics.py @@ -6,7 +6,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, MultiIndex, Series +from pandas import DataFrame, Series import pandas._testing as tm @@ -160,65 +160,6 @@ def test_is_monotonic(self): assert s.is_monotonic is False assert s.is_monotonic_decreasing is True - def test_unstack(self): - - index = MultiIndex( - levels=[["bar", "foo"], ["one", "three", "two"]], - codes=[[1, 1, 0, 0], [0, 1, 0, 2]], - ) - - s = Series(np.arange(4.0), index=index) - unstacked = s.unstack() - - expected = DataFrame( - [[2.0, np.nan, 3.0], [0.0, 1.0, np.nan]], - index=["bar", "foo"], - columns=["one", "three", "two"], - ) - - tm.assert_frame_equal(unstacked, expected) - - unstacked = s.unstack(level=0) - tm.assert_frame_equal(unstacked, expected.T) - - index = MultiIndex( - levels=[["bar"], ["one", "two", "three"], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], - ) - s = Series(np.random.randn(6), index=index) - exp_index = MultiIndex( - levels=[["one", "two", "three"], [0, 1]], - codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], - ) - expected = DataFrame({"bar": s.values}, index=exp_index).sort_index(level=0) - unstacked = s.unstack(0).sort_index() - tm.assert_frame_equal(unstacked, expected) - - # GH5873 - idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) - ts = pd.Series([1, 2], index=idx) - left = ts.unstack() - right = DataFrame( - [[np.nan, 1], [2, np.nan]], index=[101, 102], columns=[np.nan, 3.5] - ) - tm.assert_frame_equal(left, right) - - idx = pd.MultiIndex.from_arrays( - [ - ["cat", "cat", "cat", "dog", "dog"], - ["a", "a", "b", "a", "b"], - [1, 2, 1, 1, np.nan], - ] - ) - ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) - right = DataFrame( - [[1.0, 1.3], [1.1, np.nan], [np.nan, 1.4], [1.2, np.nan]], - columns=["cat", "dog"], - ) - tpls = [("a", 1), ("a", 2), ("b", np.nan), ("b", 1)] - right.index = pd.MultiIndex.from_tuples(tpls) - tm.assert_frame_equal(ts.unstack(level=0), right) - @pytest.mark.parametrize("func", [np.any, np.all]) @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) @td.skip_if_np_lt("1.15") diff --git a/pandas/tests/series/test_reshaping.py b/pandas/tests/series/test_reshaping.py new file mode 100644 index 0000000000000..7645fb8759a54 --- /dev/null +++ b/pandas/tests/series/test_reshaping.py @@ -0,0 +1,120 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm + + +def test_unstack(): + index = MultiIndex( + levels=[["bar", "foo"], ["one", "three", "two"]], + codes=[[1, 1, 0, 0], [0, 1, 0, 2]], + ) + + s = Series(np.arange(4.0), index=index) + unstacked = s.unstack() + + expected = DataFrame( + [[2.0, np.nan, 3.0], [0.0, 1.0, np.nan]], + index=["bar", "foo"], + columns=["one", "three", "two"], + ) + + tm.assert_frame_equal(unstacked, expected) + + unstacked = s.unstack(level=0) + tm.assert_frame_equal(unstacked, expected.T) + + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + s = Series(np.random.randn(6), index=index) + exp_index = MultiIndex( + levels=[["one", "two", "three"], [0, 1]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + expected = DataFrame({"bar": s.values}, index=exp_index).sort_index(level=0) + unstacked = s.unstack(0).sort_index() + tm.assert_frame_equal(unstacked, expected) + + # GH5873 + idx = 
pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) + ts = pd.Series([1, 2], index=idx) + left = ts.unstack() + right = DataFrame( + [[np.nan, 1], [2, np.nan]], index=[101, 102], columns=[np.nan, 3.5] + ) + tm.assert_frame_equal(left, right) + + idx = pd.MultiIndex.from_arrays( + [ + ["cat", "cat", "cat", "dog", "dog"], + ["a", "a", "b", "a", "b"], + [1, 2, 1, 1, np.nan], + ] + ) + ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) + right = DataFrame( + [[1.0, 1.3], [1.1, np.nan], [np.nan, 1.4], [1.2, np.nan]], + columns=["cat", "dog"], + ) + tpls = [("a", 1), ("a", 2), ("b", np.nan), ("b", 1)] + right.index = pd.MultiIndex.from_tuples(tpls) + tm.assert_frame_equal(ts.unstack(level=0), right) + + +def test_unstack_tuplename_in_multiindex(): + # GH 19966 + idx = pd.MultiIndex.from_product( + [["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")] + ) + ser = pd.Series(1, index=idx) + result = ser.unstack(("A", "a")) + + expected = pd.DataFrame( + [[1, 1, 1], [1, 1, 1], [1, 1, 1]], + columns=pd.MultiIndex.from_tuples( + [("a",), ("b",), ("c",)], names=[("A", "a")], + ), + index=pd.Index([1, 2, 3], name=("B", "b")), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "unstack_idx, expected_values, expected_index, expected_columns", + [ + ( + ("A", "a"), + [[1, 1], [1, 1], [1, 1], [1, 1]], + pd.MultiIndex.from_tuples( + [(1, 3), (1, 4), (2, 3), (2, 4)], names=["B", "C"] + ), + pd.MultiIndex.from_tuples([("a",), ("b",)], names=[("A", "a")]), + ), + ( + (("A", "a"), "B"), + [[1, 1, 1, 1], [1, 1, 1, 1]], + pd.Index([3, 4], name="C"), + pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=[("A", "a"), "B"] + ), + ), + ], +) +def test_unstack_mixed_type_name_in_multiindex( + unstack_idx, expected_values, expected_index, expected_columns +): + # GH 19966 + idx = pd.MultiIndex.from_product( + [["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"] + ) + ser = pd.Series(1, index=idx) + result = ser.unstack(unstack_idx) + + expected = pd.DataFrame( + expected_values, columns=expected_columns, index=expected_index, + ) + tm.assert_frame_equal(result, expected) From 9a01577a129e9e57f4b2b3740cfb7cfd4e63783f Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Tue, 21 Jan 2020 00:31:19 +0100 Subject: [PATCH 133/158] REF: Move generic methods to aggregation.py (#30856) --- pandas/core/aggregation.py | 198 +++++++++++++++++ pandas/core/groupby/generic.py | 201 +----------------- .../tests/groupby/aggregate/test_aggregate.py | 81 ------- pandas/tests/test_aggregation.py | 90 ++++++++ 4 files changed, 298 insertions(+), 272 deletions(-) create mode 100644 pandas/core/aggregation.py create mode 100644 pandas/tests/test_aggregation.py diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py new file mode 100644 index 0000000000000..79b87f146b9a7 --- /dev/null +++ b/pandas/core/aggregation.py @@ -0,0 +1,198 @@ +""" +aggregation.py contains utility functions to handle multiple named and lambda +kwarg aggregations in groupby and DataFrame/Series aggregation +""" + +from collections import defaultdict +from functools import partial +from typing import Any, DefaultDict, List, Sequence, Tuple + +from pandas.core.dtypes.common import is_dict_like, is_list_like + +import pandas.core.common as com +from pandas.core.indexes.api import Index + + +def is_multi_agg_with_relabel(**kwargs) -> bool: + """ + Check whether kwargs passed to .agg look like multi-agg with relabeling. 
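+
+    In "named aggregation", every keyword value is a 2-tuple of
+    ``(column, aggfunc)``, e.g. ``grp.agg(b_max=("b", "max"))``; that
+    tuple shape is what this helper detects.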
+
+    Parameters
+    ----------
+    **kwargs : dict
+
+    Returns
+    -------
+    bool
+
+    Examples
+    --------
+    >>> is_multi_agg_with_relabel(a='max')
+    False
+    >>> is_multi_agg_with_relabel(a_max=('a', 'max'),
+    ...                           a_min=('a', 'min'))
+    True
+    >>> is_multi_agg_with_relabel()
+    False
+    """
+    return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and (
+        len(kwargs) > 0
+    )
+
+
+def normalize_keyword_aggregation(kwargs: dict) -> Tuple[dict, List[str], List[int]]:
+    """
+    Normalize user-provided "named aggregation" kwargs.
+
+    Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs
+    to the old Dict[str, List[scalar]].
+
+    Parameters
+    ----------
+    kwargs : dict
+
+    Returns
+    -------
+    aggspec : dict
+        The transformed kwargs.
+    columns : List[str]
+        The user-provided keys.
+    col_idx_order : List[int]
+        List of column indices.
+
+    Examples
+    --------
+    >>> normalize_keyword_aggregation({'output': ('input', 'sum')})
+    ({'input': ['sum']}, ('output',), [('input', 'sum')])
+    """
+    # Normalize the aggregation functions as Mapping[column, List[func]],
+    # process normally, then fixup the names.
+    # TODO: aggspec type: typing.Dict[str, List[AggScalar]]
+    # May be hitting https://github.com/python/mypy/issues/5958
+    # saying it doesn't have an attribute __name__
+    aggspec: DefaultDict = defaultdict(list)
+    order = []
+    columns, pairs = list(zip(*kwargs.items()))
+
+    for name, (column, aggfunc) in zip(columns, pairs):
+        aggspec[column].append(aggfunc)
+        order.append((column, com.get_callable_name(aggfunc) or aggfunc))
+
+    # uniquify aggfunc name if duplicated in order list
+    uniquified_order = _make_unique_kwarg_list(order)
+
+    # GH 25719: aggspec changes the order of the columns assigned in aggregation,
+    # so build a uniquified order list from aggspec and compare it with
+    # uniquified_order, index by index
+    aggspec_order = [
+        (column, com.get_callable_name(aggfunc) or aggfunc)
+        for column, aggfuncs in aggspec.items()
+        for aggfunc in aggfuncs
+    ]
+    uniquified_aggspec = _make_unique_kwarg_list(aggspec_order)
+
+    # get the new indices of columns by comparison
+    col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
+    return aggspec, columns, col_idx_order
+
+
+def _make_unique_kwarg_list(
+    seq: Sequence[Tuple[Any, Any]]
+) -> Sequence[Tuple[Any, Any]]:
+    """
+    Uniquify the aggfunc names of the pairs in the order list.
+
+    Examples
+    --------
+    >>> kwarg_list = [('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')]
+    >>> _make_unique_kwarg_list(kwarg_list)
+    [('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
+    """
+    return [
+        (pair[0], "_".join([pair[1], str(seq[:i].count(pair))]))
+        if seq.count(pair) > 1
+        else pair
+        for i, pair in enumerate(seq)
+    ]
+
+
+# TODO: Can't use, because mypy doesn't like us setting __name__
+# error: "partial[Any]" has no attribute "__name__"
+# the type is:
+# typing.Sequence[Callable[..., ScalarResult]]
+# -> typing.Sequence[Callable[..., ScalarResult]]:
+
+
+def _mangle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
+    """
+    Possibly mangle a list of aggfuncs.
+
+    Parameters
+    ----------
+    aggfuncs : Sequence
+
+    Returns
+    -------
+    mangled : list-like
+        A new AggSpec sequence, where lambdas have been converted
+        to have unique names.
+
+    Notes
+    -----
+    If just one aggfunc is passed, the name will not be mangled.
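+
+    Examples
+    --------
+    >>> funcs = _mangle_lambda_list([lambda x: 1, lambda x: 2])
+    >>> [func.__name__ for func in funcs]
+    ['<lambda_0>', '<lambda_1>']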
+    """
+    if len(aggfuncs) <= 1:
+        # don't mangle for .agg([lambda x: .])
+        return aggfuncs
+    i = 0
+    mangled_aggfuncs = []
+    for aggfunc in aggfuncs:
+        if com.get_callable_name(aggfunc) == "<lambda>":
+            aggfunc = partial(aggfunc)
+            aggfunc.__name__ = f"<lambda_{i}>"
+            i += 1
+        mangled_aggfuncs.append(aggfunc)
+
+    return mangled_aggfuncs
+
+
+def maybe_mangle_lambdas(agg_spec: Any) -> Any:
+    """
+    Make new lambdas with unique names.
+
+    Parameters
+    ----------
+    agg_spec : Any
+        An argument to GroupBy.agg.
+        Non-dict-like `agg_spec` is passed through as is.
+        For dict-like `agg_spec` a new spec is returned
+        with name-mangled lambdas.
+
+    Returns
+    -------
+    mangled : Any
+        Same type as the input.
+
+    Examples
+    --------
+    >>> maybe_mangle_lambdas('sum')
+    'sum'
+    >>> maybe_mangle_lambdas([lambda: 1, lambda: 2])  # doctest: +SKIP
+    [<function <lambda> at 0x....>,
+     <function pandas...<locals>.f(*args, **kwargs)>]
+    """
+    is_dict = is_dict_like(agg_spec)
+    if not (is_dict or is_list_like(agg_spec)):
+        return agg_spec
+    mangled_aggspec = type(agg_spec)()  # dict or OrderedDict
+
+    if is_dict:
+        for key, aggfuncs in agg_spec.items():
+            if is_list_like(aggfuncs) and not is_dict_like(aggfuncs):
+                mangled_aggfuncs = _mangle_lambda_list(aggfuncs)
+            else:
+                mangled_aggfuncs = aggfuncs
+
+            mangled_aggspec[key] = mangled_aggfuncs
+    else:
+        mangled_aggspec = _mangle_lambda_list(agg_spec)
+
+    return mangled_aggspec
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index c49677fa27a31..98cdcd0f2b6ee 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -5,7 +5,7 @@
 These are user facing as the result of the ``df.groupby(...)`` operations,
 which here returns a DataFrameGroupBy object.
 """
-from collections import abc, defaultdict, namedtuple
+from collections import abc, namedtuple
 import copy
 from functools import partial
 from textwrap import dedent
@@ -42,10 +42,8 @@
     ensure_int64,
     ensure_platform_int,
     is_bool,
-    is_dict_like,
     is_integer_dtype,
     is_interval_dtype,
-    is_list_like,
     is_numeric_dtype,
     is_object_dtype,
     is_scalar,
@@ -53,6 +51,11 @@
 )
 from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna
 
+from pandas.core.aggregation import (
+    is_multi_agg_with_relabel,
+    maybe_mangle_lambdas,
+    normalize_keyword_aggregation,
+)
 import pandas.core.algorithms as algorithms
 from pandas.core.base import DataError, SpecificationError
 import pandas.core.common as com
@@ -249,7 +252,7 @@ def aggregate(self, func=None, *args, **kwargs):
         elif isinstance(func, abc.Iterable):
             # Catch instances of lists / tuples
             # but not the class list / tuple itself.
- func = _maybe_mangle_lambdas(func) + func = maybe_mangle_lambdas(func) ret = self._aggregate_multiple_funcs(func) if relabeling: ret.columns = columns @@ -918,9 +921,9 @@ class DataFrameGroupBy(GroupBy): @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + relabeling = func is None and is_multi_agg_with_relabel(**kwargs) if relabeling: - func, columns, order = _normalize_keyword_aggregation(kwargs) + func, columns, order = normalize_keyword_aggregation(kwargs) kwargs = {} elif isinstance(func, list) and len(func) > len(set(func)): @@ -935,7 +938,7 @@ def aggregate(self, func=None, *args, **kwargs): # nicer error message raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - func = _maybe_mangle_lambdas(func) + func = maybe_mangle_lambdas(func) result, how = self._aggregate(func, *args, **kwargs) if how is None: @@ -1860,190 +1863,6 @@ def groupby_series(obj, col=None): boxplot = boxplot_frame_groupby -def _is_multi_agg_with_relabel(**kwargs) -> bool: - """ - Check whether kwargs passed to .agg look like multi-agg with relabeling. - - Parameters - ---------- - **kwargs : dict - - Returns - ------- - bool - - Examples - -------- - >>> _is_multi_agg_with_relabel(a='max') - False - >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), - ... a_min=('a', 'min')) - True - >>> _is_multi_agg_with_relabel() - False - """ - return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( - len(kwargs) > 0 - ) - - -def _normalize_keyword_aggregation(kwargs): - """ - Normalize user-provided "named aggregation" kwargs. - - Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs - to the old Dict[str, List[scalar]]]. - - Parameters - ---------- - kwargs : dict - - Returns - ------- - aggspec : dict - The transformed kwargs. - columns : List[str] - The user-provided keys. - col_idx_order : List[int] - List of columns indices. - - Examples - -------- - >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) - ({'input': ['sum']}, ('output',), [('input', 'sum')]) - """ - # Normalize the aggregation functions as Mapping[column, List[func]], - # process normally, then fixup the names. 
-    # TODO: aggspec type: typing.Dict[str, List[AggScalar]]
-    # May be hitting https://github.com/python/mypy/issues/5958
-    # saying it doesn't have an attribute __name__
-    aggspec = defaultdict(list)
-    order = []
-    columns, pairs = list(zip(*kwargs.items()))
-
-    for name, (column, aggfunc) in zip(columns, pairs):
-        aggspec[column].append(aggfunc)
-        order.append((column, com.get_callable_name(aggfunc) or aggfunc))
-
-    # uniquify aggfunc name if duplicated in order list
-    uniquified_order = _make_unique(order)
-
-    # GH 25719, due to aggspec will change the order of assigned columns in aggregation
-    # uniquified_aggspec will store uniquified order list and will compare it with order
-    # based on index
-    aggspec_order = [
-        (column, com.get_callable_name(aggfunc) or aggfunc)
-        for column, aggfuncs in aggspec.items()
-        for aggfunc in aggfuncs
-    ]
-    uniquified_aggspec = _make_unique(aggspec_order)
-
-    # get the new indice of columns by comparison
-    col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
-    return aggspec, columns, col_idx_order
-
-
-def _make_unique(seq):
-    """Uniquify aggfunc name of the pairs in the order list
-
-    Examples:
-    --------
-    >>> _make_unique([('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')])
-    [('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
-    """
-    return [
-        (pair[0], "_".join([pair[1], str(seq[:i].count(pair))]))
-        if seq.count(pair) > 1
-        else pair
-        for i, pair in enumerate(seq)
-    ]
-
-
-# TODO: Can't use, because mypy doesn't like us setting __name__
-# error: "partial[Any]" has no attribute "__name__"
-# the type is:
-# typing.Sequence[Callable[..., ScalarResult]]
-# -> typing.Sequence[Callable[..., ScalarResult]]:
-
-
-def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
-    """
-    Possibly mangle a list of aggfuncs.
-
-    Parameters
-    ----------
-    aggfuncs : Sequence
-
-    Returns
-    -------
-    mangled: list-like
-        A new AggSpec sequence, where lambdas have been converted
-        to have unique names.
-
-    Notes
-    -----
-    If just one aggfunc is passed, the name will not be mangled.
-    """
-    if len(aggfuncs) <= 1:
-        # don't mangle for .agg([lambda x: .])
-        return aggfuncs
-    i = 0
-    mangled_aggfuncs = []
-    for aggfunc in aggfuncs:
-        if com.get_callable_name(aggfunc) == "<lambda>":
-            aggfunc = partial(aggfunc)
-            aggfunc.__name__ = f"<lambda_{i}>"
-            i += 1
-        mangled_aggfuncs.append(aggfunc)
-
-    return mangled_aggfuncs
-
-
-def _maybe_mangle_lambdas(agg_spec: Any) -> Any:
-    """
-    Make new lambdas with unique names.
-
-    Parameters
-    ----------
-    agg_spec : Any
-        An argument to GroupBy.agg.
-        Non-dict-like `agg_spec` are pass through as is.
-        For dict-like `agg_spec` a new spec is returned
-        with name-mangled lambdas.
-
-    Returns
-    -------
-    mangled : Any
-        Same type as the input.
-
-    Examples
-    --------
-    >>> _maybe_mangle_lambdas('sum')
-    'sum'
-
-    >>> _maybe_mangle_lambdas([lambda: 1, lambda: 2])  # doctest: +SKIP
-    [<function <lambda> at 0x....>,
-     <function pandas...<locals>.f(*args, **kwargs)>]
-    """
-    is_dict = is_dict_like(agg_spec)
-    if not (is_dict or is_list_like(agg_spec)):
-        return agg_spec
-    mangled_aggspec = type(agg_spec)()  # dict or OrderdDict
-
-    if is_dict:
-        for key, aggfuncs in agg_spec.items():
-            if is_list_like(aggfuncs) and not is_dict_like(aggfuncs):
-                mangled_aggfuncs = _managle_lambda_list(aggfuncs)
-            else:
-                mangled_aggfuncs = aggfuncs
-
-            mangled_aggspec[key] = mangled_aggfuncs
-    else:
-        mangled_aggspec = _managle_lambda_list(agg_spec)
-
-    return mangled_aggspec
-
-
 def _recast_datetimelike_result(result: DataFrame) -> DataFrame:
     """
     If we have date/time like in the original, then coerce dates
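
The uniquification step above is what keeps duplicate lambdas apart once
the relabeled columns are reordered. A sketch of the moved helper in
isolation (pair values as in the tests below):

    order = [("height", "<lambda>"), ("height", "max"), ("height", "<lambda>")]
    _make_unique_kwarg_list(order)
    # -> [("height", "<lambda>_0"), ("height", "max"), ("height", "<lambda>_1")]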
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 0b72a61ed84de..3d842aca210ed 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -10,7 +10,6 @@
 from pandas import DataFrame, Index, MultiIndex, Series, concat
 import pandas._testing as tm
 from pandas.core.base import SpecificationError
-from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas
 from pandas.core.groupby.grouper import Grouping
 
 
@@ -632,41 +631,6 @@ def test_lambda_named_agg(func):
 
 
 class TestLambdaMangling:
-    def test_maybe_mangle_lambdas_passthrough(self):
-        assert _maybe_mangle_lambdas("mean") == "mean"
-        assert _maybe_mangle_lambdas(lambda x: x).__name__ == "<lambda>"
-        # don't mangel single lambda.
-        assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>"
-
-    def test_maybe_mangle_lambdas_listlike(self):
-        aggfuncs = [lambda x: 1, lambda x: 2]
-        result = _maybe_mangle_lambdas(aggfuncs)
-        assert result[0].__name__ == "<lambda_0>"
-        assert result[1].__name__ == "<lambda_1>"
-        assert aggfuncs[0](None) == result[0](None)
-        assert aggfuncs[1](None) == result[1](None)
-
-    def test_maybe_mangle_lambdas(self):
-        func = {"A": [lambda x: 0, lambda x: 1]}
-        result = _maybe_mangle_lambdas(func)
-        assert result["A"][0].__name__ == "<lambda_0>"
-        assert result["A"][1].__name__ == "<lambda_1>"
-
-    def test_maybe_mangle_lambdas_args(self):
-        func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]}
-        result = _maybe_mangle_lambdas(func)
-        assert result["A"][0].__name__ == "<lambda_0>"
-        assert result["A"][1].__name__ == "<lambda_1>"
-
-        assert func["A"][0](0, 1) == (0, 1, 1)
-        assert func["A"][0](0, 1, 2) == (0, 1, 2)
-        assert func["A"][0](0, 2, b=3) == (0, 2, 3)
-
-    def test_maybe_mangle_lambdas_named(self):
-        func = {"C": np.mean, "D": {"foo": np.mean, "bar": np.mean}}
-        result = _maybe_mangle_lambdas(func)
-        assert result == func
-
     def test_basic(self):
         df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
         result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})
@@ -784,48 +748,3 @@ def test_agg_multiple_lambda(self):
             weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)),
         )
         tm.assert_frame_equal(result2, expected)
-
-    @pytest.mark.parametrize(
-        "order, expected_reorder",
-        [
-            (
-                [
-                    ("height", "<lambda>"),
-                    ("height", "max"),
-                    ("weight", "max"),
-                    ("height", "<lambda>"),
-                    ("weight", "<lambda>"),
-                ],
-                [
-                    ("height", "<lambda>_0"),
-                    ("height", "max"),
-                    ("weight", "max"),
-                    ("height", "<lambda>_1"),
-                    ("weight", "<lambda>"),
-                ],
-            ),
-            (
-                [
-                    ("col2", "min"),
-                    ("col1", "<lambda>"),
-                    ("col1", "<lambda>"),
-                    ("col1", "<lambda>"),
-                ],
-                [
-                    ("col2", "min"),
-                    ("col1", "<lambda>_0"),
-                    ("col1", "<lambda>_1"),
-                    ("col1", "<lambda>_2"),
-                ],
-            ),
-            (
-                [("col", "<lambda>"), ("col", "<lambda>"), ("col", "<lambda>")],
-                [("col", "<lambda>_0"), ("col", "<lambda>_1"), ("col", "<lambda>_2")],
-            ),
-        ],
-    )
-    def test_make_unique(self, order, expected_reorder):
-        # GH 27519, test if make_unique function reorders correctly
-        result = _make_unique(order)
-
-        assert result == expected_reorder
diff --git a/pandas/tests/test_aggregation.py b/pandas/tests/test_aggregation.py
new file mode 100644
index 0000000000000..74ccebc8e2275
--- /dev/null
+++ b/pandas/tests/test_aggregation.py
@@ -0,0 +1,90 @@
+import numpy as np
+import pytest
+
+from pandas.core.aggregation import _make_unique_kwarg_list, maybe_mangle_lambdas
+
+
+def test_maybe_mangle_lambdas_passthrough():
+    assert maybe_mangle_lambdas("mean") == "mean"
+    assert maybe_mangle_lambdas(lambda x: x).__name__ == "<lambda>"
+    # don't mangle single lambda.
+    assert maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>"
+
+
+def test_maybe_mangle_lambdas_listlike():
+    aggfuncs = [lambda x: 1, lambda x: 2]
+    result = maybe_mangle_lambdas(aggfuncs)
+    assert result[0].__name__ == "<lambda_0>"
+    assert result[1].__name__ == "<lambda_1>"
+    assert aggfuncs[0](None) == result[0](None)
+    assert aggfuncs[1](None) == result[1](None)
+
+
+def test_maybe_mangle_lambdas():
+    func = {"A": [lambda x: 0, lambda x: 1]}
+    result = maybe_mangle_lambdas(func)
+    assert result["A"][0].__name__ == "<lambda_0>"
+    assert result["A"][1].__name__ == "<lambda_1>"
+
+
+def test_maybe_mangle_lambdas_args():
+    func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]}
+    result = maybe_mangle_lambdas(func)
+    assert result["A"][0].__name__ == "<lambda_0>"
+    assert result["A"][1].__name__ == "<lambda_1>"
+
+    assert func["A"][0](0, 1) == (0, 1, 1)
+    assert func["A"][0](0, 1, 2) == (0, 1, 2)
+    assert func["A"][0](0, 2, b=3) == (0, 2, 3)
+
+
+def test_maybe_mangle_lambdas_named():
+    func = {"C": np.mean, "D": {"foo": np.mean, "bar": np.mean}}
+    result = maybe_mangle_lambdas(func)
+    assert result == func
+
+
+@pytest.mark.parametrize(
+    "order, expected_reorder",
+    [
+        (
+            [
+                ("height", "<lambda>"),
+                ("height", "max"),
+                ("weight", "max"),
+                ("height", "<lambda>"),
+                ("weight", "<lambda>"),
+            ],
+            [
+                ("height", "<lambda>_0"),
+                ("height", "max"),
+                ("weight", "max"),
+                ("height", "<lambda>_1"),
+                ("weight", "<lambda>"),
+            ],
+        ),
+        (
+            [
+                ("col2", "min"),
+                ("col1", "<lambda>"),
+                ("col1", "<lambda>"),
+                ("col1", "<lambda>"),
+            ],
+            [
+                ("col2", "min"),
+                ("col1", "<lambda>_0"),
+                ("col1", "<lambda>_1"),
+                ("col1", "<lambda>_2"),
+            ],
+        ),
+        (
+            [("col", "<lambda>"), ("col", "<lambda>"), ("col", "<lambda>")],
+            [("col", "<lambda>_0"), ("col", "<lambda>_1"), ("col", "<lambda>_2")],
+        ),
+    ],
+)
+def test_make_unique(order, expected_reorder):
+    # GH 27519, test if make_unique function reorders correctly
+    result = _make_unique_kwarg_list(order)
+
+    assert result == expected_reorder
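
For orientation, the behavior these moved helpers preserve, sketched
against the public API (frame contents as in test_basic above; assumes
``import pandas as pd``): an aggregation may list the same lambda twice,
and the mangled names keep the result columns distinct:

    df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
    df.groupby("A").agg({"B": [lambda x: x.min(), lambda x: x.max()]})
    # column labels come out as ("B", "<lambda_0>") and ("B", "<lambda_1>")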
---
 doc/source/whatsnew/v1.1.0.rst               | 11 +++++
 pandas/core/generic.py                       | 34 ++++++---------
 pandas/tests/frame/methods/test_describe.py  | 45 +++-----------------
 pandas/tests/series/methods/test_describe.py | 41 +++++++++++++++---
 4 files changed, 67 insertions(+), 64 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index e349745d51e83..a04ba157ce0ae 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -22,6 +22,17 @@ Other enhancements
 -
 -
 
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_110.api.other:
+
+Other API changes
+^^^^^^^^^^^^^^^^^
+
+- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes; the
+  ``first`` and ``last`` statistics are replaced by ``min`` and ``max``, matching the numeric
+  dtype output of :meth:`DataFrame.describe` (:issue:`30164`)
+-
+-
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 7b216c53c68cf..6c04212e26924 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -9567,26 +9567,8 @@ def describe_categorical_1d(data):
             dtype = None
             if result[1] > 0:
                 top, freq = objcounts.index[0], objcounts.iloc[0]
-
-                if is_datetime64_any_dtype(data):
-                    tz = data.dt.tz
-                    asint = data.dropna().values.view("i8")
-                    top = Timestamp(top)
-                    if top.tzinfo is not None and tz is not None:
-                        # Don't tz_localize(None) if key is already tz-aware
-                        top = top.tz_convert(tz)
-                    else:
-                        top = top.tz_localize(tz)
-                    names += ["top", "freq", "first", "last"]
-                    result += [
-                        top,
-                        freq,
-                        Timestamp(asint.min(), tz=tz),
-                        Timestamp(asint.max(), tz=tz),
-                    ]
-                else:
-                    names += ["top", "freq"]
-                    result += [top, freq]
+                names += ["top", "freq"]
+                result += [top, freq]
 
             # If the DataFrame is empty, set 'top' and 'freq' to None
             # to maintain output shape consistency
@@ -9597,11 +9579,23 @@
 
             return pd.Series(result, index=names, name=data.name, dtype=dtype)
 
+        def describe_timestamp_1d(data):
+            # GH-30164
+            stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
+            d = (
+                [data.count(), data.mean(), data.min()]
+                + data.quantile(percentiles).tolist()
+                + [data.max()]
+            )
+            return pd.Series(d, index=stat_index, name=data.name)
+
         def describe_1d(data):
             if is_bool_dtype(data):
                 return describe_categorical_1d(data)
             elif is_numeric_dtype(data):
                 return describe_numeric_1d(data)
+            elif is_datetime64_any_dtype(data):
+                return describe_timestamp_1d(data)
             elif is_timedelta64_dtype(data):
                 return describe_numeric_1d(data)
             else:
diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
index 251563e51e15a..127233ed2713e 100644
--- a/pandas/tests/frame/methods/test_describe.py
+++ b/pandas/tests/frame/methods/test_describe.py
@@ -253,52 +253,19 @@ def test_describe_tz_values(self, tz_naive_fixture):
 
         expected = DataFrame(
             {
-                "s1": [
-                    5,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    2,
-                    1.581139,
-                    0,
-                    1,
-                    2,
-                    3,
-                    4,
-                ],
+                "s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
                 "s2": [
                     5,
-                    5,
-                    s2.value_counts().index[0],
-                    1,
+                    Timestamp(2018, 1, 3).tz_localize(tz),
                     start.tz_localize(tz),
+                    s2[1],
+                    s2[2],
+                    s2[3],
                     end.tz_localize(tz),
                     np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
-                    np.nan,
                 ],
             },
-            index=[
-                "count",
-                "unique",
-                "top",
-                "freq",
-                "first",
-                "last",
-                "mean",
-                "std",
-                "min",
-                "25%",
-                "50%",
-                "75%",
-                "max",
-            ],
+            index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
         )
         result = df.describe(include="all")
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py
index b147a04b11090..4e59c6995f4f2 100644
--- a/pandas/tests/series/methods/test_describe.py
+++ b/pandas/tests/series/methods/test_describe.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from pandas import Series, Timestamp, date_range
+from pandas import Period, Series, Timedelta, Timestamp, date_range
 import pandas._testing as tm
 
 
@@ -29,6 +29,36 @@ def test_describe(self):
         )
tm.assert_series_equal(result, expected) + s = Series( + [ + Timedelta("1 days"), + Timedelta("2 days"), + Timedelta("3 days"), + Timedelta("4 days"), + Timedelta("5 days"), + ], + name="timedelta_data", + ) + result = s.describe() + expected = Series( + [5, s[2], s.std(), s[0], s[1], s[2], s[3], s[4]], + name="timedelta_data", + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + s = Series( + [Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")], + name="period_data", + ) + result = s.describe() + expected = Series( + [3, 2, s[0], 2], + name="period_data", + index=["count", "unique", "top", "freq"], + ) + tm.assert_series_equal(result, expected) + def test_describe_empty_object(self): # https://github.com/pandas-dev/pandas/issues/27183 s = Series([None, None], dtype=object) @@ -57,13 +87,14 @@ def test_describe_with_tz(self, tz_naive_fixture): expected = Series( [ 5, - 5, - s.value_counts().index[0], - 1, + Timestamp(2018, 1, 3).tz_localize(tz), start.tz_localize(tz), + s[1], + s[2], + s[3], end.tz_localize(tz), ], name=name, - index=["count", "unique", "top", "freq", "first", "last"], + index=["count", "mean", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) From 8a8e96757adc6d64348929d523326d8e32e8c834 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 20 Jan 2020 15:44:48 -0800 Subject: [PATCH 135/158] Split out JSON Date Converters (#31057) --- .../_libs/src/ujson/python/date_conversions.c | 118 +++++++++++++++++ .../_libs/src/ujson/python/date_conversions.h | 31 +++++ pandas/_libs/src/ujson/python/objToJSON.c | 125 +----------------- setup.py | 7 +- 4 files changed, 158 insertions(+), 123 deletions(-) create mode 100644 pandas/_libs/src/ujson/python/date_conversions.c create mode 100644 pandas/_libs/src/ujson/python/date_conversions.h diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c new file mode 100644 index 0000000000000..fc4bdef8463af --- /dev/null +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -0,0 +1,118 @@ +// Conversion routines that are useful for serialization, +// but which don't interact with JSON objects directly + +#include "date_conversions.h" +#include <../../../tslibs/src/datetime/np_datetime.h> +#include <../../../tslibs/src/datetime/np_datetime_strings.h> + +/* + * Function: scaleNanosecToUnit + * ----------------------------- + * + * Scales an integer value representing time in nanoseconds to provided unit. + * + * Mutates the provided value directly. Returns 0 on success, non-zero on error. 
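+ *
+ * For example, an input of 1500000000 (1.5 seconds in nanoseconds)
+ * scaled with unit=NPY_FR_ms becomes 1500; an unsupported unit
+ * leaves the value untouched and returns -1.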
+ */ +int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { + switch (unit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + *value /= 1000LL; + break; + case NPY_FR_ms: + *value /= 1000000LL; + break; + case NPY_FR_s: + *value /= 1000000000LL; + break; + default: + return -1; + } + + return 0; +} + +/* Converts the int64_t representation of a datetime to ISO; mutates len */ +char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { + npy_datetimestruct dts; + int ret_code; + + pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts); + + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); + + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + + ret_code = make_iso_8601_datetime(&dts, result, *len, base); + if (ret_code != 0) { + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); + } + + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; +} + +npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { + scaleNanosecToUnit(&dt, base); + return dt; +} + +/* Convert PyDatetime To ISO C-string. mutates len */ +char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, + size_t *len) { + npy_datetimestruct dts; + int ret; + + ret = convert_pydatetime_to_datetimestruct(obj, &dts); + if (ret != 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "Could not convert PyDateTime to numpy datetime"); + } + return NULL; + } + + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); + ret = make_iso_8601_datetime(&dts, result, *len, base); + + if (ret != 0) { + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); + return NULL; + } + + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; +} + +npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) { + npy_datetimestruct dts; + int ret; + + ret = convert_pydatetime_to_datetimestruct(dt, &dts); + if (ret != 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "Could not convert PyDateTime to numpy datetime"); + } + // TODO: is setting errMsg required? + //((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + // return NULL; + } + + npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); + return NpyDateTimeToEpoch(npy_dt, base); +} diff --git a/pandas/_libs/src/ujson/python/date_conversions.h b/pandas/_libs/src/ujson/python/date_conversions.h new file mode 100644 index 0000000000000..45455f4d6128b --- /dev/null +++ b/pandas/_libs/src/ujson/python/date_conversions.h @@ -0,0 +1,31 @@ +#ifndef PANDAS__LIBS_SRC_UJSON_DATE_CONVERSIONS +#define PANDAS__LIBS_SRC_UJSON_DATE_CONVERSIONS + +#define PY_SSIZE_T_CLEAN +#include +#include +#include "datetime.h" + +// Scales value inplace from nanosecond resolution to unit resolution +int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit); + +// Converts an int64 object representing a date to ISO format +// up to precision `base` e.g. 
diff --git a/pandas/_libs/src/ujson/python/date_conversions.h b/pandas/_libs/src/ujson/python/date_conversions.h
new file mode 100644
index 0000000000000..45455f4d6128b
--- /dev/null
+++ b/pandas/_libs/src/ujson/python/date_conversions.h
@@ -0,0 +1,31 @@
+#ifndef PANDAS__LIBS_SRC_UJSON_DATE_CONVERSIONS
+#define PANDAS__LIBS_SRC_UJSON_DATE_CONVERSIONS
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <numpy/ndarraytypes.h>
+#include "datetime.h"
+
+// Scales value inplace from nanosecond resolution to unit resolution
+int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
+
+// Converts an int64 object representing a date to ISO format
+// up to precision `base` e.g. base="s" yields 2020-01-01T00:00:00Z
+// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
+// len is mutated to save the length of the returned string
+char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len);
+
+// TODO: this function doesn't do a lot; should augment or replace with
+// scaleNanosecToUnit
+npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base);
+
+// Converts a Python object representing a Date / Datetime to ISO format
+// up to precision `base` e.g. base="s" yields 2020-01-01T00:00:00Z
+// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
+// len is mutated to save the length of the returned string
+char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len);
+
+// Convert a Python Date/Datetime to Unix epoch with resolution base
+npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base);
+
+#endif
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
index c5ac279ed3243..0367661e5c554 100644
--- a/pandas/_libs/src/ujson/python/objToJSON.c
+++ b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -45,8 +45,7 @@ Numeric decoder derived from from TCL library
 #include <math.h>
 #include <numpy/arrayobject.h>
 #include <numpy/halffloat.h>
-#include <../../../tslibs/src/datetime/np_datetime.h>
-#include <../../../tslibs/src/datetime/np_datetime_strings.h>
+#include "date_conversions.h"
 #include "datetime.h"
 
 static PyTypeObject *type_decimal;
@@ -209,34 +208,6 @@ static TypeContext *createTypeContext(void) {
     return pc;
 }
 
-/*
- * Function: scaleNanosecToUnit
- * -----------------------------
- *
- * Scales an integer value representing time in nanoseconds to provided unit.
- *
- * Mutates the provided value directly. Returns 0 on success, non-zero on error.
- */
-static int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
-    switch (unit) {
-    case NPY_FR_ns:
-        break;
-    case NPY_FR_us:
-        *value /= 1000LL;
-        break;
-    case NPY_FR_ms:
-        *value /= 1000000LL;
-        break;
-    case NPY_FR_s:
-        *value /= 1000000000LL;
-        break;
-    default:
-        return -1;
-    }
-
-    return 0;
-}
-
 static PyObject *get_values(PyObject *obj) {
     PyObject *values = NULL;
 
@@ -379,34 +350,6 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
     return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
 }
 
-/* Converts the int64_t representation of a datetime to ISO; mutates len */
-static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
-    npy_datetimestruct dts;
-    int ret_code;
-
-    pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
-
-    *len = (size_t)get_datetime_iso_8601_strlen(0, base);
-    char *result = PyObject_Malloc(*len);
-
-    if (result == NULL) {
-        PyErr_NoMemory();
-        return NULL;
-    }
-
-    ret_code = make_iso_8601_datetime(&dts, result, *len, base);
-    if (ret_code != 0) {
-        PyErr_SetString(PyExc_ValueError,
-                        "Could not convert datetime value to string");
-        PyObject_Free(result);
-    }
-
-    // Note that get_datetime_iso_8601_strlen just gives a generic size
-    // for ISO string conversion, not the actual size used
-    *len = strlen(result);
-    return result;
-}
-
 /* JSON callback.
returns a char* and mutates the pointer to *len */ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { @@ -414,44 +357,6 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), return int64ToIso(GET_TC(tc)->longValue, base, len); } -static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { - scaleNanosecToUnit(&dt, base); - return dt; -} - -/* Convert PyDatetime To ISO C-string. mutates len */ -static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, - size_t *len) { - npy_datetimestruct dts; - int ret; - - ret = convert_pydatetime_to_datetimestruct(obj, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - } - return NULL; - } - - *len = (size_t)get_datetime_iso_8601_strlen(0, base); - char *result = PyObject_Malloc(*len); - ret = make_iso_8601_datetime(&dts, result, *len, base); - - if (ret != 0) { - PRINTMARK(); - PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - PyObject_Free(result); - return NULL; - } - - // Note that get_datetime_iso_8601_strlen just gives a generic size - // for ISO string conversion, not the actual size used - *len = strlen(result); - return result; -} - /* JSON callback */ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, size_t *len) { @@ -465,30 +370,6 @@ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, return PyDateTimeToIso(obj, base, len); } -static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) { - npy_datetimestruct dts; - int ret; - - if (!PyDate_Check(obj)) { - // TODO: raise TypeError - } - PyDateTime_Date *dt = (PyDateTime_Date *)obj; - - ret = convert_pydatetime_to_datetimestruct(dt, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - } - // TODO: is setting errMsg required? 
- //((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - // return NULL; - } - - npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); - return NpyDateTimeToEpoch(npy_dt, base); -} - static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { PyObject *obj = (PyObject *)_obj; PyObject *str; @@ -1814,7 +1695,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); + GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); tc->type = JT_LONG; } return; @@ -1840,7 +1721,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); + GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); tc->type = JT_LONG; } return; diff --git a/setup.py b/setup.py index 191fe49d1eb89..c7dbde2ff862c 100755 --- a/setup.py +++ b/setup.py @@ -240,6 +240,7 @@ def initialize_options(self): pjoin(ujson_python, "ujson.c"), pjoin(ujson_python, "objToJSON.c"), pjoin(ujson_python, "JSONtoObj.c"), + pjoin(ujson_python, "date_conversions.c"), pjoin(ujson_lib, "ultrajsonenc.c"), pjoin(ujson_lib, "ultrajsondec.c"), pjoin(util, "move.c"), @@ -714,11 +715,15 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ujson_ext = Extension( "pandas._libs.json", - depends=["pandas/_libs/src/ujson/lib/ultrajson.h"], + depends=[ + "pandas/_libs/src/ujson/lib/ultrajson.h", + "pandas/_libs/src/ujson/python/date_conversions.h", + ], sources=( [ "pandas/_libs/src/ujson/python/ujson.c", "pandas/_libs/src/ujson/python/objToJSON.c", + "pandas/_libs/src/ujson/python/date_conversions.c", "pandas/_libs/src/ujson/python/JSONtoObj.c", "pandas/_libs/src/ujson/lib/ultrajsonenc.c", "pandas/_libs/src/ujson/lib/ultrajsondec.c", From cdffa43af34aa832105dd3787eec20ea0222af54 Mon Sep 17 00:00:00 2001 From: Rik-de-Kort <32839123+Rik-de-Kort@users.noreply.github.com> Date: Tue, 21 Jan 2020 00:49:13 +0100 Subject: [PATCH 136/158] ENH: XLSB support (#29836) --- ci/deps/azure-37-locale.yaml | 3 + ci/deps/azure-macos-36.yaml | 1 + ci/deps/azure-windows-37.yaml | 3 + ci/deps/travis-36-cov.yaml | 1 + doc/source/getting_started/install.rst | 1 + doc/source/user_guide/io.rst | 29 +++++++- doc/source/whatsnew/v1.0.0.rst | 3 +- pandas/compat/_optional.py | 1 + pandas/core/config_init.py | 8 +++ pandas/io/excel/_base.py | 17 +++-- pandas/io/excel/_pyxlsb.py | 68 ++++++++++++++++++ pandas/tests/io/data/excel/blank.xlsb | Bin 0 -> 8908 bytes .../io/data/excel/blank_with_header.xlsb | Bin 0 -> 9129 bytes pandas/tests/io/data/excel/test1.xlsb | Bin 0 -> 11359 bytes pandas/tests/io/data/excel/test2.xlsb | Bin 0 -> 7579 bytes pandas/tests/io/data/excel/test3.xlsb | Bin 0 -> 7553 bytes pandas/tests/io/data/excel/test4.xlsb | Bin 0 -> 7646 bytes pandas/tests/io/data/excel/test5.xlsb | Bin 0 -> 7824 bytes .../tests/io/data/excel/test_converters.xlsb | Bin 0 -> 7810 bytes .../io/data/excel/test_index_name_pre17.xlsb | Bin 0 -> 11097 bytes .../tests/io/data/excel/test_multisheet.xlsb | Bin 0 -> 10707 bytes pandas/tests/io/data/excel/test_squeeze.xlsb | Bin 0 -> 8567 bytes pandas/tests/io/data/excel/test_types.xlsb | Bin 0 -> 8053 bytes .../tests/io/data/excel/testdateoverflow.xlsb | Bin 0 -> 9856 bytes pandas/tests/io/data/excel/testdtype.xlsb | Bin 0 -> 7697 bytes .../tests/io/data/excel/testmultiindex.xlsb | Bin 
0 -> 18853 bytes pandas/tests/io/data/excel/testskiprows.xlsb | Bin 0 -> 7699 bytes pandas/tests/io/data/excel/times_1900.xlsb | Bin 0 -> 7773 bytes pandas/tests/io/data/excel/times_1904.xlsb | Bin 0 -> 7734 bytes pandas/tests/io/excel/conftest.py | 2 +- pandas/tests/io/excel/test_readers.py | 58 +++++++++++++-- pandas/tests/io/excel/test_xlrd.py | 4 +- 32 files changed, 185 insertions(+), 14 deletions(-) create mode 100644 pandas/io/excel/_pyxlsb.py create mode 100644 pandas/tests/io/data/excel/blank.xlsb create mode 100644 pandas/tests/io/data/excel/blank_with_header.xlsb create mode 100644 pandas/tests/io/data/excel/test1.xlsb create mode 100644 pandas/tests/io/data/excel/test2.xlsb create mode 100644 pandas/tests/io/data/excel/test3.xlsb create mode 100644 pandas/tests/io/data/excel/test4.xlsb create mode 100644 pandas/tests/io/data/excel/test5.xlsb create mode 100644 pandas/tests/io/data/excel/test_converters.xlsb create mode 100644 pandas/tests/io/data/excel/test_index_name_pre17.xlsb create mode 100644 pandas/tests/io/data/excel/test_multisheet.xlsb create mode 100644 pandas/tests/io/data/excel/test_squeeze.xlsb create mode 100644 pandas/tests/io/data/excel/test_types.xlsb create mode 100644 pandas/tests/io/data/excel/testdateoverflow.xlsb create mode 100644 pandas/tests/io/data/excel/testdtype.xlsb create mode 100644 pandas/tests/io/data/excel/testmultiindex.xlsb create mode 100644 pandas/tests/io/data/excel/testskiprows.xlsb create mode 100644 pandas/tests/io/data/excel/times_1900.xlsb create mode 100644 pandas/tests/io/data/excel/times_1904.xlsb diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 111ba6b020bc7..dc51597a33209 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -34,3 +34,6 @@ dependencies: - xlsxwriter - xlwt - pyarrow>=0.15 + - pip + - pip: + - pyxlsb diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 3bbbdb4cf32ad..90980133b31c1 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -33,3 +33,4 @@ dependencies: - pip - pip: - pyreadstat + - pyxlsb diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 62be1075b3337..6b3ad6f560292 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -35,3 +35,6 @@ dependencies: - xlsxwriter - xlwt - pyreadstat + - pip + - pip: + - pyxlsb diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index a46001c58d165..869d2ab683f0c 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -51,3 +51,4 @@ dependencies: - coverage - pandas-datareader - python-dateutil + - pyxlsb diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b3fd443e662a9..b5c512cdc8328 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -264,6 +264,7 @@ pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pytables 3.4.2 HDF5 reading / writing +pyxlsb 1.0.5 Reading for xlsb files qtpy Clipboard I/O s3fs 0.3.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index e776da016d5d7..d0780e4ab8dba 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -23,7 +23,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text;`JSON 
`__;:ref:`read_json`;:ref:`to_json` text;`HTML `__;:ref:`read_html`;:ref:`to_html` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` - binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` + ;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` binary;`OpenDocument `__;:ref:`read_excel`; binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` @@ -2768,7 +2768,8 @@ Excel files The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files -can be read using either ``xlrd`` or ``openpyxl``. +can be read using either ``xlrd`` or ``openpyxl``. Binary Excel (``.xlsb``) +files can be read using ``pyxlsb``. The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. @@ -3229,6 +3230,30 @@ OpenDocument spreadsheets match what can be done for `Excel files`_ using Currently pandas only supports *reading* OpenDocument spreadsheets. Writing is not implemented. +.. _io.xlsb: + +Binary Excel (.xlsb) files +-------------------------- + +.. versionadded:: 1.0.0 + +The :func:`~pandas.read_excel` method can also read binary Excel files +using the ``pyxlsb`` module. The semantics and features for reading +binary Excel files mostly match what can be done for `Excel files`_ using +``engine='pyxlsb'``. ``pyxlsb`` does not recognize datetime types +in files and will return floats instead. + +.. code-block:: python + + # Returns a DataFrame + pd.read_excel('path_to_file.xlsb', engine='pyxlsb') + +.. note:: + + Currently pandas only supports *reading* binary Excel files. Writing + is not implemented. + + .. _io.clipboard: Clipboard diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3bd86bb02155f..ec6ad38bbc7cf 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -215,7 +215,8 @@ Other enhancements - :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`) - Roundtripping DataFrames with nullable integer, string and period data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine - now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`). + now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). +- :func:`read_excel` now can read binary Excel (``.xlsb``) files by passing ``engine='pyxlsb'``. For more details and example usage, see the :ref:`Binary Excel files documentation `. Closes :issue:`8540`. - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) - :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`) - :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. 
(:issue:`30270`)
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 7aeb0327139f1..d561ab9a10548 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -19,6 +19,7 @@
     "pyarrow": "0.13.0",
     "pytables": "3.4.2",
     "pytest": "5.0.1",
+    "pyxlsb": "1.0.5",
     "s3fs": "0.3.0",
     "scipy": "0.19.0",
     "sqlalchemy": "1.1.4",
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index afdd8a01ee003..eb1587313910d 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -479,6 +479,7 @@ def use_inf_as_na_cb(key):
 _xlsm_options = ["xlrd", "openpyxl"]
 _xlsx_options = ["xlrd", "openpyxl"]
 _ods_options = ["odf"]
+_xlsb_options = ["pyxlsb"]
 
 
 with cf.config_prefix("io.excel.xls"):
@@ -515,6 +516,13 @@ def use_inf_as_na_cb(key):
         validator=str,
     )
 
+with cf.config_prefix("io.excel.xlsb"):
+    cf.register_option(
+        "reader",
+        "auto",
+        reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)),
+        validator=str,
+    )
 
 # Set up the io.excel specific writer configuration.
 writer_engine_doc = """
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 04015a08bce2f..2a91381b7fbeb 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -35,8 +35,9 @@
 """
 Read an Excel file into a pandas DataFrame.
 
-Support both `xls` and `xlsx` file extensions from a local filesystem or URL.
-Support an option to read a single sheet or a list of sheets.
+Supports `xls`, `xlsx`, `xlsm`, `xlsb`, and `odf` file extensions
+read from a local filesystem or URL. Supports an option to read
+a single sheet or a list of sheets.
 
 Parameters
 ----------
@@ -789,15 +790,21 @@ class ExcelFile:
         If a string or path object, expected to be a path to xls, xlsx or odf file.
     engine : str, default None
         If io is not a buffer or path, this must be set to identify io.
-        Acceptable values are None, ``xlrd``, ``openpyxl`` or ``odf``.
+        Acceptable values are None, ``xlrd``, ``openpyxl``, ``odf``, or ``pyxlsb``.
        Note that ``odf`` reads tables out of OpenDocument formatted files.
     """
 
    from pandas.io.excel._odfreader import _ODFReader
    from pandas.io.excel._openpyxl import _OpenpyxlReader
    from pandas.io.excel._xlrd import _XlrdReader
-
-    _engines = {"xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader}
+    from pandas.io.excel._pyxlsb import _PyxlsbReader
+
+    _engines = {
+        "xlrd": _XlrdReader,
+        "openpyxl": _OpenpyxlReader,
+        "odf": _ODFReader,
+        "pyxlsb": _PyxlsbReader,
+    }
 
     def __init__(self, io, engine=None):
         if engine is None:
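
The reader registered above is also wired into the engine-selection
option, so it can be pinned globally instead of per call. A sketch,
using the ``io.excel.xlsb.reader`` option name registered in
config_init.py above:

    import pandas as pd

    pd.set_option("io.excel.xlsb.reader", "pyxlsb")
    df = pd.read_excel("path_to_file.xlsb")  # no engine= needed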
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
new file mode 100644
index 0000000000000..df6a38000452d
--- /dev/null
+++ b/pandas/io/excel/_pyxlsb.py
@@ -0,0 +1,68 @@
+from typing import List
+
+from pandas._typing import FilePathOrBuffer, Scalar
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.io.excel._base import _BaseExcelReader
+
+
+class _PyxlsbReader(_BaseExcelReader):
+    def __init__(self, filepath_or_buffer: FilePathOrBuffer):
+        """Reader using pyxlsb engine.
+
+        Parameters
+        ----------
+        filepath_or_buffer: string, path object, or Workbook
+            Object to be parsed.
+        """
+        import_optional_dependency("pyxlsb")
+        # This will call load_workbook on the filepath or buffer
+        # and set the result to the book attribute.
+        super().__init__(filepath_or_buffer)
+
+    @property
+    def _workbook_class(self):
+        from pyxlsb import Workbook
+
+        return Workbook
+
+    def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
+        from pyxlsb import open_workbook
+
+        # TODO: hack in buffer capability.
+        # This might need some modifications to the pyxlsb library.
+        # The actual work for opening it is in xlsbpackage.py, line 20-ish.
+
+        return open_workbook(filepath_or_buffer)
+
+    @property
+    def sheet_names(self) -> List[str]:
+        return self.book.sheets
+
+    def get_sheet_by_name(self, name: str):
+        return self.book.get_sheet(name)
+
+    def get_sheet_by_index(self, index: int):
+        # pyxlsb sheets are indexed from 1 onwards;
+        # there's a fix for this in the source, but the PyPI package doesn't have it
+        return self.book.get_sheet(index + 1)
+
+    def _convert_cell(self, cell, convert_float: bool) -> Scalar:
+        # TODO: there is no way to distinguish between floats and datetimes in pyxlsb,
+        # so there is no way to read datetime types from an xlsb file yet
+        if cell.v is None:
+            return ""  # prevents non-named columns from not showing up as Unnamed: i
+        if isinstance(cell.v, float) and convert_float:
+            val = int(cell.v)
+            if val == cell.v:
+                return val
+            else:
+                return float(cell.v)
+
+        return cell.v
+
+    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+        return [
+            [self._convert_cell(c, convert_float) for c in r]
+            for r in sheet.rows(sparse=False)
+        ]
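+
+
+# Usage sketch (not part of the module itself): with pyxlsb installed,
+# ``read_excel`` dispatches here for .xlsb files, e.g.
+#
+#     import pandas as pd
+#     df = pd.read_excel("test1.xlsb", engine="pyxlsb")
+#
+# Per the TODO in _convert_cell above, datetime cells come back as floats.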
diff --git a/pandas/tests/io/data/excel/blank.xlsb b/pandas/tests/io/data/excel/blank.xlsb
new file mode 100644
index 0000000000000000000000000000000000000000..d72fd68ab3dbf214b3b0c4b5ace56b34ce627b13
GIT binary patch
[base85-encoded binary patch data omitted -- these hunks only embed the new
.xlsb test fixtures listed in the diffstat above (blank.xlsb,
blank_with_header.xlsb, test1.xlsb, and the remaining fixture files)]
z`>IdqL(qmm=1~45=$h980|$DJ@GKHhwM75rwzg3Y?kaWCs!?ookbFfSC)cy+i5qL2YO{Oo z8R2xm+S*g<<%IDGg>aYXUZop$!d3=WY;%1lEbNO`R1PnWo&YBm<2P`C7Dgrh)S+`i zXQicmRXsa~%jMK;UY&tWHqQB!L&9oSY&9s}5nlvv$#6YH@xAhqi2*b^Y`kOsaQd1O ztondRk&>vh+yrH(^sU7(W6ENdNgQK}l03LdYE5zrbytgP(&&E8^G(+Gi(*OMqVe4g zrSIZi;q}bNbKm_k#!9ue6|z0q72|^lw7-U7Y0hcwPYAwvB4~q;_>Z91kHoUJFu!BW zX9`1Ag!!UN;i1yU@T10m1eL&^(FR>2poD_C`^^{C05@SS#Z^!q?pXiBM<+R$V}Uq-4I-%o9ReG^#xt@`r^#&`JuuNdcAumvv<4@)&b zD$Jx7K>vn@BNFsoE5EuWe&!%W9^o)6bN~Q+;TGpRaJY*z*ZDu5@4~Y-H&5wj@r?_4 z&=LRyeUk!)@rI}Yr0@D;L5zV5+Ona_vM+`Dje#uh0r?s+XcpYX+1;p*!8ya-viG1o zHBg?S?{3?H@+mGEo0_Sbx%c*)_2X6zqm;nuBqdO#fk6!YOcmHEEs*DxyUp$p#&uCe z)kKJz>y?yvSnj$gWFG<`3K0U@qxOSE`C;oplg-$C=q9L_Q!uLosMB$|AAa5PbODV~1f2qidMJHCZ(>?%Ie{q%7C zxQ3?fJ~xN^x(=-JjM`{}ahaYbW?iN6iW{?{1?wCB6q9==Avct#qac%9Q_!VJ^!T?2 z;Da`*ur6OrlH*yLtAcfB9X<@l55rUNHxzyFNCGC`4OzOmrS?cM9CIn3xbhaQu@{_X zW1#{?516wj{5l$i1r`S67>>0Xy$z2w7MbHtJtjMEQgbb%$QaLkXlx;^_dL*0JeE*E zL))Hs=WlxUItt&<^WG-vDMrNEBe`GdZ}bTi7iiQK?KyE3x!_D)l*CRtywp*{rUJh!3dx2-$yMNEV2VVr6 zTM)OX5Td02g?)d7{|gSD2Y=hamK*Kd1Q_#m^CH17r&l*6q|-Rv2|N?`ft}`480o1o zXjbp$W?VWxh_{{L^4>mpk;?U&u}D|&-gPipp;))+`_r#qx6ukTiB!_D^#xxv!|R(a z&$;JK-GJjhNhy%8tQbpsL>Qqjq`t~mSu;=~q;3c0O6m3)Zgt;0mg{fA1v@D6df<3` zX67EJukG_&2j3pcb>7I39r3{16em60V8mK7BO3IoX5qTc;e7%cG{Euy;4dKl@Ezj) ze$^kLmb;%CLVvV>uRrKN=&v&92mOisq(A>f5C0S#wtlRowszQpK_tj75>#v(2~zY1 z;c5t_On^tgk9c;$&b-`?+`9fX+%-_%39JAx#1Vr9h++%c=m!xK>RpEDQ83UBg9sXo zPho^HPbWQ$MPrCuuP_myqn(S>@&|F^eHUlNnlV)j9#hO<^GE1yua#ytK>RN87cLUw z!?{OV0E-k#E(2@GPzS9e<$+z4mq2Uok*a0)19saOL9f!X|oCZw}F4FM2fNBD6s<2YUYO$ zeyX$n)R}(HTe`|cP!RAd_lZ`IVm6NgQ14JG2rlYJP;0KDBFs=oicJwDj~W6`o$!w| z4FzRKsb@bTu^mnYt%5vY=rSv>XgQriCbv2v{e8_Sd_dDCTDI{piIjNYVwfUEHb~lr zE>bNZ2CXi;IY>Q^`P_3B*Q%N!(cKf`TAu zluZlVMXfBq_gPU`Sth-s#}&|zVf#Xo4py9l*GI^qVno+UtpjepH+&OSQy5yCuwx&%Oynw9XD8kj zH9s0|@$TxJ-K|o-L!N4_*B;cQd=*=zGM+XWN1QLwjkF4$%hk#njiQ)co-ntito~8~ zCVWl3K`{8f{keCCUyc^`m|u2d@JcY4aT*U$h^M5ZdXVi<)=5@#?KJEz+cKVI6dN;8 z4B^;c!&VD9L1O?W0MH5fUavaOrd;m8?ciMJ-_Y;%s`Xm~5trr2nz$DI?tJB3d#X

    ghjW4ZOG zpS0Z)HLEN!s?C!rTHY9ZZP01*+#2;#(GqYFk9gu1`Vw+=^~>mG(syk=2?O#~D_^3S z^VItegQa^wYDx1^O`WYxK5Ow!Gt!tmtC+CPUr(P zZo8L!i9&8UTcBV`@9{88R~SQMayEEGEM>5V#Ebaxa8s?CSpIgJd=5rILUf|Ft-_u4 z^_pS=Q;x3*UY235-YJ%yubIE3?p+TL4(1Z}l{q_F(iiAkfe`09MVHVD4Ev?tX5tnp zT@^Z7V94UbYRw)R*tY>B*1a%so&fVZ`B3^7Xy{t?c8pDy><58t{D!us)Q?6E0_tO^AU5C&w8W7t{re6V~6 zTWv-zA;WQjI#vcP;JNfAV(!~dv}6;7Pjs?i;#SeQsF5J64xE&*OxlhvVh!b_st@nb zOB;b6$6xo;lf@f6j&{>^%IE4&boifZ!+WT6%}shwcV^$6oeq%CZHfCHcO{?67tVF! z5T9*k`S~37m2YX_vvVr$96Y6&JKbxWEa!C9pdt?BAZcEZwq=8 zeaI(>c*_XS2kM$Q7S|^k*(qrl;*va0u~g+pBki~$XewOX@HxlO5XTxELh7iciNii90n^|lT?Bn^udJNsf zX(yZx?-RF}$(J!brh|^miD zMD&d+yFS~mtafQNg)G^8V1{k|9A{9Hl>st7;Tx>ua4tsP(vHN3QYSMYHh>#L!w%3DtRX&aJR zRrXjclYKGVOw|e_)C_6q5^x5e0vuWAm^bXlELW%S$T%MReki6UgK->UpDZ5C-}mgo zM3~} zk#|{?um%79RpHSUnLD?~#y46YlU;{nU#AJymdKm*q=|U?m2&Z!GpbU*q6G0){Re?h zw88x(93tPSlq26m4JFsH_MhSmq2I1kWKKr z^WDTqTcOH*sbv#G@U3QXDn|R8?PBxlv-lnFsKQ~FI%w|I%tpyj73Yv$eGcDy6>;Y5 zj)#J2nO@8;9W=C9l@+(NFfs^%?1tt@Ei4v}f^gSo#^h(qaFM@$}&X+QL*)F{RmQ*5?g9_w;=v4H=u~U!GEF9ipJOWNRy0sA)vH zy{+h-oGdaB;m0`v)JBk8qF)o>w|>5!G>*0G!tMUJ7vE`G4M%EM$><43X?g#EzMbo5 zlDICws~OTn$vUFfu0nyKcS^kC`0Sd=JeaCWfhu$1@lj{>M6N-;9P7z!rH;DTpClfS zVAt)d3xL)}O>|2^p0(tvy5=N0ZF{eL(R9ErnFdp9JDCy3?#?%z&3YGj-jeUam2Ml& z=-3={ZC*!FSoX-{w;rEvkbPpX6?LN}dz`KI_@SZIn-u}fJPC2zS~)K7EybW9`vw}m z&{FlhsjcKdS(zb++M5H}+G`jC_BLP_BXc`YiKr4yDo1mD+IZ>NZt_}MtWp!FXg|M#8z~ET15o?ZX+=FS0 zq_~P_x@74?ehZQZjaxK!M|REb@HAFI77f9GmsBqezL^!SL=^Sc?#7k$JMW!6aeZe< zXC}miS0jMaCpp^EHMpH)16&uJc$(Ge@LLZQiM7VGBktrOGMc|Bb)3Jutx6qsD~uqc zq5XEd)bN9ciF!Cz>qIAHJ0iK+*_m85+xJWCYBf)A2JEZA9RYPOCsQwP76fzDzHXGj zZh49pK`q0>{#@_1b8DP1bGfjJGqem8HFEmdIf9L!=B2faE`cYog)eqPia0yEZg~NN zz?4O2EDNTM_GXMt;SQ8suaKS)k+!qmQgX5u&+j4YE#8bBv zzil;9j0p@%zwiutFdxf8jyotVM0T5KiQ+vX|NS*vn=D3m{Ez0I?{DW`=KuHHH_kfY zFCpfh7_mk8o4G%KcW>o7dlU%})fO$xF(yxvPcw${LFgvIyTH%X3`Fv8c8Ew$RIT+` zh?lqmyWGU+63iL4*88WbeMxN}pBiF=+qN@?B zp00I&wep>E!FaJ}8|km3F+eH>l$imdukc3TD*K;lMBLN!+Mn?8V^A0NOrL2slx97G zZqbWO6^D2fn9eV|Oa_MG)jLL42+tK7W>m|Ez%`cD^P~ z9t!s6aQE{{=y04d00kYAgaVE64}I$=ixGOHN+^rrQiPMRin$OCyN#uJL>PvX>d!|& z&VyijRhpOR;sGAn?yUJ!Oe-AvU!4~z`Vx#iohv4 zr92X|GiymZCbp?b`V}*zgzjp53cTeuqJJCOXsO?9LeTFBOuA%2VISaojbMn7^^1Rr z*u6O*T33t|u$NMi3GK|CW2N=PBS%B_wd7QRQ)blNF54EbyN)!GRFU@#R6Z3&y#Gvm z(&*{AQT$cCwr;_HSl6-l8n^x@7mnSTIy>n|xi;4TAzs%d*M?ZgX4+om$QZHXQbV^- zq~BW9q}ECcM3g-IfK<6x_?2IG5(_S>(+5mm@eE7z8->BLZ=(;6s*mlueO0NXPUINc z@ymI6@`f_vjHN|^!<>glUGhx`!u_g%a?ZrqB*Yz4M1&ytk%}gcj{jrgclrKtWyH7G zPk%pqfqH-!y7@4Jk0lhQ4b8voIZ**nsHDU@y;(;qKUe)#aY7FFUab zBd0k#w=gdmfI?CRonj>_DqT#B!U|`{(^crCcyjo}Fj(3NRcF?sX-1ej$cTiRBRNpW za+$1k)N@O7pn_&X73v)$;03-7cnO>3d)ldS43r+3NADB_34)1w(+_M}WV^KF&TRB# zr+44)Zcgx2XlG52iP(e{j=alMA~chkOb$XXU)0o8e2#vcMn!$EY>`h(-$t}#Qdg&w zp2dT;NRIn-n7@;8q&QCtGs(2~&FRNl4(WqF$ev)xilP$}f!V&D8Bcqo%#E*>id?#= z1Z+LHW_Pg)8{STE>^t3=pMO;;0G7~wAv+`V*i!cu~}G1x;c!){O@>vN_8 zrqxcP%W)$g9+cXBf;R6Ay)H`0rZqOid2b+u+}ql1r-dM6)7e0CdWwPND4+RMT{YS} zZP6?Ko?YveUX4Nh96zPO56MHjb+>%eccoHC66(G2PJbKCMOY{o*$~rp9r5|cXnwxn z{}j!sL}CPJ*jl>*fv;eOPy!mvg(r%C(>EhjLTUWov4H-&r<|vhKRtzj{*R|*nMnNv{U!eFCoTL0w?ng#IuCiYsTqpbk;jfxIvMF-4_`!c8m;wym`F~+UWkDBGh==j)}=O(&%HgLbKg%LEmaImasV~}2LJ#t113V9tpYCq09Kd)02u%W zXe96AjI?$}n(6ttS|dz&y`A7p*_c4iOaKu5{r`@C@eU;Db!jy7gX&m!h+|)HdUjo4 z9xlJgNlKNkKw>*KBdksTezyj4>I+?!*Q$8#{@~HLP?^w7kDZP_`-Dux;6w!BQ2@mJ z(NM05&AQ`R^3_tzoN_}NTV^XG87O#mCSuu$I0-aq+4hk7G#-?Xm(S+D(m1HGDFtvY|Ei)Uz%)z84j(OK*9%07k#`+Vmp&Q&m^cQaqGd0 zmMNZT@$l2(2ejidH7DxpV2M_^TLeTYTawrwVNlnxY|kpzcs1ZoPzEZl?nJ>&wd_23 zCB_%7avp0rDBB3Ocp1cMYSZp4G!;~YW}DoLtS74U_n~W4w;03nY^*mR^xy+GRb;xX 
z_dgg0F-@+-cySDRi#?cH@_8;JQBjg_;;o8MJ_!8OSlCO%rZX71K-1Rw9W*Z&mzFP7;)T#tzBLo1e`ZA-aMe(}pE zXJwqMidv?p(;#l*IfyfDMPv?}4C(;JkuO|Hf4a5Tmr*y}GnD^EOo68)$U{5GGOzX4 z*p2Ohx{`F6R)yYv&L|!+_Yf05|0=(gSRICk_50e^6jVGPb$9#1v{91Ums-lY9^GtG z;JTkBuX#H&%1t*hn)=K^roLa#S66mC<)T*4z)H_pMXpovGSxr^SuN5AYd3Wq4CP@I zG*X6|<3+n z*bAE_y%*fBMOP&8BsiQg-yiMzuR8g(zxVhMy;6hL2@L?}f;XJ+52<*$xI0?9xH$4! z+B^Rxn+xbfhgRFa`zX?B{6?GlZ-IVGctZv!E4NlnxmNBdAm+7t1@yUP*}z*4?jVoB z09C%k^viEiyYu_YIa@6kSSuQNI7{g;Zy6QyLJm9OX96T9Q(MP&nXQ$ zS4D|i;~1itH#?YjJKkxAFA8&=;Vn8}7g|Ird}$I$dw*iIyx}6WyFTBkiN4*x@}|3j zcd{G3f`#S-G5U(;%|99AU<0!=7q*naP?NcfB~OIG8UqcEE0U>X4#ox12vGBP!xpsx zD&?Vp34;9jr2>NddqTo$RQ+UF?)B6~)Rxq7)H;DSi$aS6FreENc>#H_)>9m14j~wz z)jhyrn0gWnUKZ#KQolf=O+Wy-O+Y{bLNqo}&_@uQMi3yVZ^#(Oq}&R49o87njP;QP z$|ON;XUYvq1RRl3gO>$WHi&J2g=!Z%&kT-s;#_nBq?Ge0o8d}I*w&PpR^XfX{J==K z1?(Hme|7u_wxGw3cG?In7{WgsM<9LR)`)LHc^cQHf*}i1>{Ao%C@L=lO??4EuClzo z*{uTs`8VX-$&kpAPpvY(9-Q6Y5w}^v5H{fAd?+}x850wkEyiC}csyFKYa*2=AUdkl z&~i)Od%l^v^lqqdFCLYF)qQRXho)FHg{gM~ZcUt3iH?pCCbQ2#p{>~AOE{lH)`?^I zmi11}9^YFk@hj*sm{6R?zDqT2ls>u3HRg-CGB>f?|DmQ5PZDKkUs=5>+qddyue65e zn>^*5WZAWAQ)D|!$oOnACw1n=D++a=ctH;$`SK#TJlX8yN>G7HC(+`~t>=b=GnDTq zQ!Ju1n`1`y`_;U!w@Dl}`{Q7uX$ogbjzW; zOvCPlSp3z>jg;PKYy!<^G--j7HAKVKn)|zq65c5}EzgCa*z|cF@4xER7iXjkY_UpA z7KHjdvz%M;8W#}>V7PtydC2wR?^N0!h@d2SuQl$UX1qCE~o9yP%$GrC`L{Eo8$Xf5+@Kk)m-!Xrhshzq1?4on!;1 zqz8XBvVQ!DudIup?HYxMt)Nsl$G+rjVFltVo=huyiT>8e?TV=kVCDeM&1ClhzvDEU zyfxcfgb56Wcf8i79VQwa&kAW%-@iEldt<4b`qCcWl@8#fLBHQQ!=Bmu^r%5-8!utr zk@oQR^XdIh*rtJUD)Tvn$c`5`f|nc+?I8nFJawzSr?(-9H*&HfizAS{q?fngg%vo{ zR@WH;q{^{OuUILCBxPCi=Ke2 z)}Qs?K~-g3@t69tITHV3-MeJL|Lbbm?A_BrqL9{-?CssPJby|5R;UHG7Ty=APg){mr+g$M7{f-m^0(k`jeDWZd@U?Ut!nu8{Y^d?Yl*ZclfQIn^ z{S_2JNihEte9I?ChScB&j5fRbkWqkfJi?xYp?BfA-!80V!I>EePmz#$6+1a|2fI@R z&KnG(6?V}f#}{rDpp2?(i{h^=+T>+cGA3^4v`AGCK|B_tS14iz$I0i!+*I;q3#z*W z$7CK+f)j0u(JMI>Zq8eg;fi8rQ|`*EMe>^}A_7@bx|kp4Gi>{@$5> zay{!_jTBgcrqJRR>8l=#fT(;Yi9fOk#y()W2F>^qCO|)7eN8qj`w5{0lUu%(^!zog z?Dp!85W>3+-4Y7v@(fyaBGXZj4fw zd>@5*pqpQjf31dEJBHa7*FHG(CF=Op^zHQ5`nxW* zk7wUFya=HYHg={Mny#6wy=GLkNhY@{bBS|i#R1iuB^CLFpL1i_il>)x$%dJ(Qv%G+ z?>dCZMay9^PsY+D8)oyem9KWNBf+*LZ|aEvk%YepeE;$el7arE+i{B_Z&%)so}?IQ z;Z9sebo03p9PQut96oe}HiE{=&P48j0qa<~(b_rD+ zYRYDw=F_cc1m-Up1fIrY)RrW!`I-ZjwzWbz8M-8MpPSm~L-w3(ZX5AO;sx}ylmc%) zaaJiYdd#8!IB`;3&?%#}tZauYp%5da<95~CME3F}3nodb5pPm<0AExnPe?pmUmC&EW`U%7s zNkTjnz%y`tgoy>nK?Kq6deqf7Bx9A!{LrWx7_{9R=NME}=eWqUY7@5u3%Vw!Sqo)h zO-m1I$}_a6FX-kYN=$?ktullxkgde@#v9?Nbia6P4R{yQjGyUF(<2Zlnp_=I{K=(Q zz`C^Xt&W~ZLKN#2eC<(fq=4oJl}^N!PnV(tJrj$;z=BXV$B_HF6XyGwG}FRcG+g(K zO;B)Fu(-OgzSNKw#e~QPw@vp;IELe^x?AyMw=T;QBtP>y)Tq3RV~JUql0?Z``gDEpK1v%I>vu$#H{@UdzbEBzNmC7pdz`8Ut%?j&aa-^YfpC;5$90$*|m~B-wIkZOqd3pWq+JU`E;xZ?KBFRV5mncD| zt&S*0bQ=4$c8WNtH)uw$yh5jX>R(d*k0g)Oti@}R0j1XC3n<1QEm%^^Mn5R>Gu~P# zH!RAQ@B<6RD65wmWRD9mlJA1$*vLE2U1OSy9xzjJ(;?M8?Y+F=d!N&bi92s;aRYnU$5-QL;ud+nIq*WTFF&;V#^sx#*f zVSbdAlocYIlo3e5iMK%0v`7YK0&!=w0D|$U!Dxiv>~1zAFQF-VXWh`1DG9ojgSl8~ zxx2U`_^e#qt^XWV{@1~wx9(1yiDoksb?BD*2{GYVWkN4B9%+)@<&1nVa7?lVnsI_% z77qAwh#K@vF%mXy65nbTcWHn3z6Nj9ylB`VM43DXKt3i{(27%^FytBrTuUM?8pKnw zyGR)s-W8rIn&5~Ltt4umib?AoSDGNoT8=EqqBeRHl04BakwCRFIoy4eV=#Abhr$S; zF)!9EBF3I%^&0D5Pi)wMa$|J+a+e;xj4J`faAs;N8_VIlwT9t?zCaCex6#23npt~B zRKjI%K89%{p6Iq78!{(s6M~G4In+cmLSH1IV4C8h-eQohbAY(hLtGAsFiy)7J)2C{5lus2)DtRviHUm~Zt zt~SW8As_dMEo+P`vMl=GsO$*p>aV>ef1+83i;Ejy&IYC@u#Zaz5eH;<`JG*V=<-`q z8VJ-*bD~#7(I*4?{IgH|v${vOOcw4w*PUV3-ruYH!8mgO5Q`}hC=e5H4^t>cl##os zy`eo5A4Y|-qsNY}k$*>g5ga#U1e*BdXzhXiNqh@e*Z+9`ZO{HVQsZPb7x+P;TgqP; zS!SQ>%HZ6HB5^Qq132YATb~e%Z)p*LycjQFjgv8RUS8 
z^URoz<=IHpBTMAxkB*%TD4L=e31+5PX}fBUdA&*8(!L#dRxrjrC>^$obp{>(s|1EjSHDmP)kiuqMV%LL%6Su@s*0Ue@FD9jarLKfQEy z>TK`snxxbQjQwy`qDvZmEB;7~7l4=1x#0Itn*O?De_em`+(}FIXMmpv-hUZ>yVjs( z@sIKMdBdM4fd4d{N9*nXo(G=CIX@BlfkcD*x9}uM^>c{+fdl|NLu38@SN;sx XTB=yzLp=e206>psuN`_Y0s#CEceHL? literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test3.xlsb b/pandas/tests/io/data/excel/test3.xlsb new file mode 100644 index 0000000000000000000000000000000000000000..617d27630e8a06a5eca901b92f91e896fbb4ca43 GIT binary patch literal 7553 zcmeHsg2LK3AMttLf8hJfbQV;t<|=xfCv4OEyebB6Z&n=p>3{WY4DZ9^l`GHj;DzDLaQ6et;EkFgX!CK?53~-+FQ+t`~|=r zhMpB32#a$o35BXeXH1N0HgWGnJ%B zv_8Lm!vyI6qFkkzm2MQ8j@oEp5}=i9v% zT0QyXp^jUCXl8yq58@$Nf_Ts&l8e~oQD0!3rJ~Om&UOw)9<|I5jhDWZQ0A?Q@i&OE zFX=X&y1F~sQk5s)tvvjOD}`6WC(bH7vN3%9jv?c%)hN(cyRA4`b-mAC$s=uj3BtcA5sbQ^l`KI z^mOC5zwYsuY%tIZ9a?Sw+gpWU`*+&3eoyo}qFeH~1;x$U>dlHrQEAV05YRIF`q5XM zJTd-bQCb37d6!?I_LmPo6zz0jup!!cx$5Y#LdfA8blBCC<3pIZ_FI+PF>2UsWbpot z@Q${mtcmJ@n)L zoi~FB{@FqF8EiBkNYF<#Z~n;`S0|W@t*E^`rnYDP#&ibXksdWR7SML`zjjhL31ZS z$Q9P-*9HwCpvbmT7kN@eia8|H^RdOrR-Su3>fOQBnC0dMVYc}U6xq2m!H4@f zZj&TU;Dhm*&Al6IRpC$j%w|;PaU!VaE%Ijfxu-(0)|Y1X-@I>nhA)G1x&G`0Qegz? zc3pJ?KlH(z$9?;OeWwcNMIxprt3|mBS8FMCf-{8uiIo~E;7a6+_nv{CYW5SaUfU@% zA6uY$H=ARZqSu)=dH6;q=cE{=|VzJDg8A3uctPDSvMc2~3n zah6WO9x&RCvCd zxqcT^YNS4TXJBeWnv1)ac&Zm>%NafH6@(mVurGs{g^^vQq{7tVMk=}05@2U?a4a`v z+Y_6*4%X|sR7+%BdA5}^oQgxJ_k{Lgv`iE6grnZ!K9h7%PEl8xC=`dGr0?A~PBrrF5tcd8UGsgLDw=NZ1GpE}$WpRRl8UY`>;tvW*#)72SojKZi?3L5IOlbTmLAXfz8aQ-ro=@^I#Le{aC~6_>U|}4kI=04(V*5nD z5=TzeIJaM%zOfY4)x)0mZ0=oWM8SJ0%lx1uuG`)NR<_&LGiST811@TD5*@_Tufr}O z=JXS1bZdLn1e3l(pq)aNIW_ExK;4Y%-nseYyL6-W#l}x5*_7?4ZW>^DcxrRN;vN?G zA-DM-qPTKz7)E+{ zqkKrWV5-LO+R4r1PXhG=ggt7MV$Q;{gPeyli{%L4Z}{@v991SelOc$?N5K41+-nbf zM#GOEI+biVn-XO*nui5$%)8FCxt*8O<-U7)3J${7JPW1!8X*_OMT`EvcaF1g@ab-w z$S!{7vK!skkh1y1PdL`mikizsM1FnMS7X;)eS70ZWqDhWp=Tiw-QYT=`z5;=RU~!k+u?73-}f|;k{w;guvDJuTFzMsjFaZ z**-SYvTlp>B-FS<%mjgPm~X#cnw(^=wmJQJ+%2+xc)O$|HSf`tHH-oBHBEKQ)7bDKbh~v7e3U~K6N+M+Wa5VE$-VVg|@=46Z&$@2$hrx2`FMbZ*Sz`5-lBK7eXyB<+L|v zM72#vnXID-t6~MS3GB0v%xS=Fm_05DkhcKK4BzXdjKeEs;rp;tW-o7f|eP;K|t);v1#%KY33J_;y!HJQKaVzBOX~P*7xSE62_Z$JQ<2niQeQ1Y-qQxJ)NUQwh zSt;aLSN_V-SS&MzO^U$at%0AA-h0h{U#U-gsnG#hm0;l0cy_nA1fv<-!+hF#(H&au zgi0$EoDD3gBWfZ$u1`55w#DN#SOdp&t8FpOm@>VrMEKxI;8)|ODM;gsi6%t2;sS(u zDfpDn3Pn$(Q1tAuqC6TXmmC^wq-uI&uI6M7)M~@sNJL0 z)y}1E&g7|jxgOK;6kb3ONmmiUPV~C5*Imu8U(%STK|~XI?01Qtt8Cxid>h5)oR^k$ zlE2#6n!kK0-mr6rvgN4>kLYOt;?0*c*B9&ewB~i!SVXBvRUPHdHpdcB27Y(K!wgpF zm~0EzcpJ6Ys3^0K+YF7nCNJIhaXh(1lMwY$ru6FZkZZjWuW0#JMEgbnZM>8I21OGh ze_<;S^VR64oZmwS>@TyEZRLlw@#d1a_|}!|Nbv{pO(taxts%{$2eTyg?t~RGcT@SO zK-S%EC?@nW_IvGg<&v;yGx|(Ada0-RZK?mc$isA-@jK)}xkK2=AVsNKQUXK`*=ZEk zE0xm}E-sgKf-{KQJ5!$Xc&WffT0h%Bj{s*>0LQe(0X`uVMcyF*FBtn)Qk^CdfR*3+5jCLErfjXJaH^{ z3+@-h*4pcQ|cFUkPqHU*NQAL$fLs?cGk~^J}p1rEg zb@FsIx?tHZQC&gcAkJ(0KGc^Ih?hI2(@(q_v<$s#KWH&OWY^HZ%hTGHEQ>bqXI;iLyhUDj zT&IGW79)=VaoNF5=04gBsVP43%{Lg%E#5!j@ssM3b=Dfa#fcg&%kXZV?|J7z&bC^z z8h-qB&Nw-a z&N?#z^o`t;O|wWhQrGA$Lya{mDF|y-XL~>|Jrd2z-$_|{A2Gv)j=4B`XGHIRc7uP` z@aV?K&L>#I1Lhd?qlO<#w*>&PnX`aGX;C+@MAF2WcshF9dh-cjCBQvn4s<2_Z`TmR z_4Z3b6Q2To2cUlv-_FbHKi+?j*&kbOx`N({ASixE{e+2ivCK#w_i75MtC=^zz4*!I zj6_CHNA2Yi5aRUMpUgj}Bkv1TWCwY4FufeHI9m~$*wY*{-17N20)SKn)=%qv$pq}D=jV+OZsaHx<^tJbVef->r)XKt%9tM=ZyJ$tr zxl23gtNsHE;!vQ8f`g=;Q(ZnX`?i-7sOmAR$N{qp->6An3>zoQ&j zpioRTs?k*`qK8y0K%q+Rn1-3J!oyzUMCtFsXcCkp4|Uyq+16uODA#+g*kWmPNZMcT z1)e{h;u(`mTsK;dwp(TJhRQAmeaQ5D?jwcN`ZCt2@FLdou0?xlau9Tzp|RjIuZq{) z$0L0>EoFgbs@H@b=3ZtN*qx~kp7~C}|8p4iF)ETl?nUn6@cxF3>=umUD^hwu4*e+p 
zPzeU`GCK4B`}?H7p4eZ;Hp^IlmJ`+9smGf1B^#FTcIuh09M!l z00jUKXasX{_Oy2PG}G~Owe~O(@O5%z&A|q8X90ki|NnRV4{IP%r(3;6h_s%4ha~P9 zw|Dmiw&9l-xyfk?YAE!nspm(Aj zP2qOl0+M43*SL@1_R2QGtv&{E8k!6{3(ZB95jiHeA2fifLi`w;)GS7Eyqg*fh`j{q znk%zh*5BP7CS{#iiS^+c^cBB5x#U*}m8>i)F!5FOP(BHTL>Xa?ag{OUv|v#SBhR8x zHaRSNX0zZ6?pFx=*ohU8XJkyZ<+Ayik*uxc(LsI!dAXO=bV8aEBvCeq%O6E1a!wVZ zLNNUN`V|WR|B1P<(1o@`j5*%G5R(wYTr+oTM-M@PZ|DCQ`#((6e=R*Kz8}L_qV_H2 zdf4Les(tqX2>i%BJXUt43h9~p@ybHdLf`3xQ$Hm>j z(#6F=!19*!-?X`a33M2?{cj&7T20@~rs12T-xA$`;$`R6-B7NR`+|&prB*3cXjw7v zhKo1UYY?d_n4I~?o5S7t{pH-P)(h;FO?=$tjM)AZj%y4!&qjuNvG6Q6N;gB5aM;Nm zyVi>LMZD97$ZNb=q0-i#=8hD`ArA;Lm?BBD2F-cjJVH_(>QTGGbTYN2cvMHV3lpxad#1V#sc zf~U+Sf&jF+BkhN2C+O&wAzh(F5*J>UWg}5@_pq>F*saE~LNJAk>@XSj7Dh!XQj$eZ zYE}WuZG`fL+DPIK5N=uuK{yzChOKzJCQ9o#EVYqW8(AKr8(hDbPH9duSlA|%9v7D# z5%XzlThBTwSHRj_1975Z;n=`49!K{|O11Yf)K zO1DP~Od8TyU+O;UORlIj;gTTsAT>KMk*d?t+@u?tz*L$vt+)thChaS6uzQWW zv3N~JTi|blbhakv!zPR(88Nf$Ka9x3&f413<1at-{RiY9FXY(YWRGDV?(A{ambs@{ z#wVfnNbNC;SO+vSv?8fhH2HzG%I?9w)5h3N%4eDEvzHM!r?~IOIA%E*!du0z_E(^` zx<}U}xp_K3qa6ryF4T~#FS`GwWg*xgko+bUHI^z58s_kV5GRq7bG|m!5~Oe6Q=xA6 zOg!OQ)kb<>3@(wzQ~F0JsanvmwZ{G~i==OQZfl{a7%o$O=ZCL44W*AWAzSQ{6Gahz zPc3Ixe8w(cMl$=Kej0MU_&bxjAL^|C3Ksx~rvFx8e5aA8owbv-;P*4+TY<4|(0~7f zBIRqEBN@CYrmyiQR(G$Al|OQ-NDm*o-s$$9*(ibOR+LRW0|gK)K~0c%3!V4Qh$PR6 z6=gd3#^yt!Si}iokVyOzUBKiNzr+);gZ`6&!|A}i>7hf#!6m*!_EpA(a0=?`$=$s8 zwOQZRc8<)d$qy~T*>hKovwWkt@4I!Im~WbnpKgbB+bM;MH-pCB241Y3gh!66zvxgB ziuw!|Yk?T2Kj)AGswdoX%gCbGW*D%{(QdHhC6s!jGxjwP$wDh_3Q<;3@dpffgTYXE?A)1E0BV5V??r%R*19Pp*z_ILQ%)@ z#mM^6s$fMoQM(e=Wm{qC9Fn?m8YT6j? zLUjEahy{$`G~9eUJ2Jxh%{EDR~h$YW<^EbG9`WoxPL7!I#q_4S1Z3kFQg7vu zs4PqOR8>_1rz!6%{?}*Jd zqUZ?P6lYR&D&Cy8qQJj`okP6~yYWECRKWvvDZQKRUIFuV5QiK?XVWD*^ebkXSTxVm z9`pmqidZ@WeuU*B8+#<>Ghxi|`pZ*3cw&P=Riu-+CL=th1oEb-T*=0!Z9N4uO`++G0kT3!1c#hxD?otnOx`q~ieQkOFG z+WuKMy{NG>)zDP!M4ggR^(KWJ8p_W-y<&gZmo5F^ScrRL*ov=@Wyyw(u}hMUQ^<7) zn}>nRVxHnsvuvcz^ESb{!Ole6vi!+Z2uL#Ivg7_AJD$u;&)tq%4F$UMhjgUG$%}X5 zAIG!|-Buo`JX+=9_jxv^J~Ff$k9-+KK|X=tyjv6@A0)vY_9U~H&5oMf-r6NXVW>HW zZAwtPvI$tQWPmzNz^W@tUJEb>DsICgxS6}9@(N9D^uT*gHvUFJ4+xOGt>r-7RA-ek zqZBT^l;jBsVW-%4UA-LHmva4SNd{3No74R{;gLYU=F-wQ!qNvnx1U3DpiSeF;S z(bBn`7|ni_Q1iW}Cq!dTrOV^$Dt`>hJGqn&SQNqG5PnB{+JRtCvUgIQ<&PWVj@ z8Hr_ctI~S1IO$xvo)51(4zDwH<7CQiNU6zbpH2*J7*6goTRZ~c3J!m1a&EASB3l)=MgX*5YP0b2lygI8I_BS zl5?v&N>8Lmt{qvV=IHJgNLa`fi_dAUPA*937uR~F`?q506B z-^$ZmGtr45nMzf_#(i*}JK05D@j*U^6rG2$mw1~o6eNyH+{Zl&Yk4kykI-E&M*bMoBaABxshT%bUtz$qlI5} zbRdp@pn%cM4a}hYQ%4?l7Vg#vZBO@G&bHsv%(VWFrmn`uMqq1my*Y0~PtK z2`GH-)wi=9tUdhzt;7yEAY20KXCaF#mIWn)bK~QQpt@*;gfW=Exowu&lND1;2Ihu& zC^Ae5h;XrjySuo02wJ(gTmRMK{IBxEs4g(xOru$dvTa7~1f=?Ssy3AUT~r+RoWgaY zKAD};DjK`7iM&#x*9}v+cmtouGCe}1SDvB~delwSQAvgm$nIKFBIga2E9rAn0;xfo z=1kh5REABo8%;Uay@Qf=@3}k8c77y?N-)RL&`RF5k9yyClhUet6fVr*cV~BAD?MH! 
zM3m1M^4fy0u|b@9mYk8I2Qmq0S(TT+_+*v~v2T5knpONZPzBy+NltdM;AO6IyaFxG zkWfV*ByD*{2J}(aQ)p~HO5(CI8P%L}vHE<1{idJg@rR;po!&C>EPkpCcdVXU32NiI z_sp|pPIE%_gy8xq^CGZ{o=}4;hXH0Pq&~;%vH{PK>E>)#c+N@#f{pS6u;pj52Fv<1 z{nlnQ_UO9&A=jJ5@jF?$r2W(1`ECzLg+OMoHwC(2kzeoT#>PHRSwujOVQEo)kxJ0_ z%dby3#;^R=D7P5jw~JtAzEz*hnDJMA`B!OxskbcL{jNJBtbMbmv88;>a%rYe8;W-`n#|t74d9gbbbVYrp}qOVAN{12Cr4i7 zUg^!52Vx>y=r4O?^T9I{C1H{6b)mhs65Wlr%;>85#;Q*I{N4(4Eb&@5goQEz#v4xD z*xnCZj6?t`DCeG=6Km?bM%SWr=lDAXcq(H`bB5cQ4!xX2OX|<;n#kUC?HPdrfO@iC zbZ5Bb5VwJWueWtg4+B3NGRpyz=h?6wUgmhJezElY^wFV<`H;S3R$G&!miS|g-A7iB zru@<|J8O&7Q0r3u(Aj*HFK|TAsq}_C&s9Uutj@PYy|#uQsJf>qJ1~iTbq5djl`uA{mG*1q*w>pSEN^uH&x%HQ2W28xwC7P4 zi%f1}(lfrxi7v0)ucB2C2CHRj!i$?wXYJ27>c5TC!UPvlV{^j|;#*FoELtFRT9E zvA?gsd36O>{Tblrp84N`->$V7TKubneqQkBCgQ&Z=P|YI|GTAl9_M^h=?4-${y#3@ z@22E=;q%>@AHpDl@4`QNHRnao_aT0Wni2kU-~V4%;ymE_vi=941<`lF^F{u7fb;3n z4*(TRo8V_>_#=5bk8(bt_<`b#$uBS{zoZxE5q{1Me#ip=OBjS-GKKTv=e_t30DqeE t%Q)}T&tv`UqJJO(0B8)>KacX4!-lKkU`!7HAi{i@FxFd)X|MnQ{|D+kn<)SQ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test5.xlsb b/pandas/tests/io/data/excel/test5.xlsb new file mode 100644 index 0000000000000000000000000000000000000000..022ebef25aee2403bdd974bd7d02036e4ba49906 GIT binary patch literal 7824 zcmeHM2U}Cw)()XauR%hUNbkK!5s)Iica#zcHFQD~2Lgftks=5PNC^Up6sZDIq*p;f zx&cJ0h;#vIH;yyc5&gbDaA)T^=OoWQ>&@9~uXnYhuZ4>T0uTTQ0RR95FcIltkHi50 z?C}5q8UP{CO4-xH59Z-#V;tlK^Sv$@=niK^;{myI06^^h|BnA*4I~@)>9z?|Hn43{ zB$jde_u)X^R-NOfrhBYHX259eQZ{N#3$ez^iKuga3a31V-hC`msv9lK~nkw!TQ?Ti5*Cs95oe$4K( z)H0;|`3eVHV^sczVaYso`?2i?YjPzOT-&~_r8AGNdttKO!E$)%I>()%ius^`(!}1h zo#|vvHC|(Q@gi-3KA$iXNmw@#Yg>w$A-+_#k6l=tI+V)k02kbFabrfcKHw$M#CfFZ z%4QtgTHo1UPws2Z?2| zN%XO5Y$%qWr>D38{hyet9R8tW4{MIvSYnc3nQP+%gZl~zemnom*#BXg{$uH}_Xe?y zCGA{SZ&02;{OF-hm|OjV^$~`WmtvN~gR%N<9=qb+7YDA#;x)|2>)V6b4VYKMkDH`b z_)5bO2I0`c4)ZaSjW-RYS&AJh1FyN`_@sSqUB4On{N_@kAxli-t^tgej&Ip$YY=6y zCuhLlUfFlwv{i-YcCNBsa88`JQA#}c#8t8JwebxjrSXh&`mf$By&9{|cQ09_dy`G` z!p{+ZD|4J)lJA13mAa%&9s@CRA#v>T^UOH)X*znw4Y@aq{^--Pj%XTs&0Cg0o5f3r zYe35xaz6|j)>S6Ejhdm3is;s7Dwcf^ap%J0Q0&rw<;ln0?bLnjOg)w-3;;r$K)BF% zsswoYxIsNV-2|a79)HsY2OH?HZ2R9H#fB~4%%<_1qhA+aQzXpIf1$1Z;>tl-Lak1< zkm~ zReUyTc<<`dT`~XkVd^@6)W&!fO<^lYRe!&;cj@c!Zy;NWY_l zgR`9jb}heI?l6f*-WROtb72!fvhU0F^$UW-e817-kx`#6trDetKz(Ds<|1c$tuIm& z-EuCe)+;~p*%f7=0$?sD`j-0Zc1+qnP5?qDG>lK`G7!Fgu7Bum3?K4V7cq4#-cV){ z10iEyuK<{e-=6fw@Q4B_uChSzsb?}Q-ehWIH}gG17yc0j{>px8UaBI= zs$0~y%l;T8OWqgl#mJJ;iD_VEs_fw0gg6HIhy}-3u{1xV^B=UEx!YUZ5t{jlUGG4o zQ#jsC=Elnrklqn~b;L)B-VrUe42Aa+SEj_XUi*_g_nzGtu|tkQEeL(cfc+QM&_0e; zCXY-m;cgcr6ov}C=u!tBe7^JYWPl#IwsV(Z#*?sWmDET;0$tL%6o41Cse2d3^Lou6 zU(+fw$3@v1CDN&xQ&B{c_D0SCY&L>$eO4b>$MbRApMjzGK0>3qVY;AamGu6?9TAiu zm@a~&q8FNf&Dwdn!7z^HVvZ#Q3?f#|4c9GXV}jcUy)Fq8(6INQj8m+Wd#=JxC*O2w zC+M;SykK0X%!h9h&;Bd;LiwddU6%+Oo*Khz=*eB+1<{aR0@a&Yr_xg}wJGhaoosZ- zz=nu@@^d2c-{u>+2ApXvSi@k#&a!@$E6AbsAczBMRrGC4l z%wQ#Qy>Dz)mYb)We5~8SmJ2!T6}U231%1L{5khT3dmdMdXGPhqoCH6alXI^A0hHX_ zwZBr=xlB6ga?M)CKs*7dUID{Hq+C7uTbSPN)&<$XjJ)i7=MnoMG-Tp+C&r(r|c1TzmU3oS=CVopDvUrOOSk!ZlnKY1t+QnsWF`u06AfMi{|I%eHDkVtcL z`h1hADsodrzWCHZlM0C5dUaFPX4(UiaNxTggq+`-%HWRwJ_dMG&RQ~YuNK#q< zzDmdG$*@!N@z;EM!d2&2%L`+Ar-3Is3sTvq2W>RC4YYZ=#1|4MxkeEoPz)3jt3#mX z6nA?w!-5p3B2EGd^Nd&(q!6Fo@V_a={K@)K+EzFE=PrUS))!GZXd|*lG`jV~F zIisT!aE<0B4;2eP`bph(VD6j;#1H;)>S|)j-a)mP64>Pk zW>gDEr4%T=>Hw;|rBjPYHFsBuh0rJDN+yn~onyb%z^{V`8L@=NeKvPj)5-fhYtKwX z0yMxqz$vf<7-my6IOT%`ai~b-%EpgvpKtgj(TKkyB>TuM)!q z5RMMrtf72Iuwcwn!i5T~M3L1s#5|A;J=Bs>GA|+}d{Hn4i5>v0SKZ#Lz*Jd@hzMpy zkd;qI9#UxfofAN3fK5C6ql3#{uhUMR%<%1$Rv&#q1POHH3_-=xKG!as$iAyP&Z5*j 
z4AI!MyBIJ_1j_NIV2FGu7d&?)ElDw3Xx{T08I{qgK4~uUA)AtA2Uq+y=ElT%-2#op z;B<4}l$TEoBpGj#$X&m66`q+T(+Bfz)$U+Z3c*gie^^woCSwa! z^MYRi#j9ow8JAV?B^^kLHS(oRl5)2c-h~qsi(x3Q1(55-3O;mQ>aJ3|^1wreh@Re} z?H=H+x1_AI8EQ%~!KVQS9P zRa{=dx3k|fE$UwMuR-a)rx{0et%NtDD*8X%3ecy1gp-O+DpFFHjuX&qEf9#xE<~ys zDziWL!N+_5kYfyB5j57VG_=e9p$9L%Hr+p8RYm3|UZh@i>%M8a4HG~A1aJsL(aANM zAUJ15SdeDLcLxxs0n85UbrJ8vYtmf2Qj-&u_402R`7rX8OpSBRn*f1XU4#0FK?6#V>*|2GYxrgjDaoq=%iYOx zhFSWz14itikv&rnskxLfLi>Q%fw{dD{u!0$(>b5s%#W>VXwvrtk8iM`uAXaJ5aiZ# z;C+9>s`d=>u}@xum+_g(=WV#lOCdL^q=Ms|`5LC?xv!Cin6>knMMk`DF{ah8eK4+d zt0`I!{TCph^DMVuQjIbzEp%M<{et=qhoc;twO=3myR^4nOiE=})wpJjPUd?dWWWzZ zZ@+xH!!V*Y#V1}=V%@S*wHg_;g}0WwrNhB#?)J9xPeL5Z!!|Un4^Of2>en2%NglE_g`GiQYa{S4G5tq0 z4>9N?$5Emz&bt0dsa@wpEbe@9ZE8R*FIER=7#%e@f zUY5G}kq;qR{j)%6VRNkq$sZPlsrn!J6N_^O*gPGKXwzjl${I!88Lq3d<#k@fFSzxd zQ#7c`izw!h5T##;e<$8HjZhz!^^EwbCKMF&*;a$3`6lnHjm_Y!!P=s%!AD;)_Jf6$ z!A*nmZ8pbg?4lt|9jk@SJZ)VDDdFzp_PAk)9Siu};Fl7+5OQt$!W{W$wVk~h(N@Lp z%B4Gd?OU)B{8xq05H19pVHIJIJwV?T;_GbZ19LF)^KtQT`ZF13in&e@rVmyb91+sk z63E?iCUR#*lf7Gp_~cLO2~_jglR6EoZQ<>^a+Etl-_N%eWe5AiXsXhUPmxZhqk|%% zxp65`iAd|}aCPBs61h0K-#ti8#|-c~Hq-LP?hF;SYIX3m*Z1-C@)ff8^nv}=mHoF^ z#qLnZJsZ7NVNl16&Jnp*Hm05&G8CK0{a#fgc0ggXq=wFUd@{d;w5bt8AdTKEmtQ&w z9?%(HDdi6r0P|I+vr>0(13Ib`TqTr*4&)k?kO*xeQ)hgds&jV=CKgnC`Fg97SM&IU zb;HTzG%R|b5?RWc8N>?MDEGkUnS1YOxoa*Sy(#wU*pDg!aCytMa!7Ng+1KJkjYOl! z)m!wsKlXtH6un6ClXEgV*e~vPt-ehDGKe%?xumu7<^H-KYRZXZI+|;5pf{%fu}NyJ5)Mg5XAWI)z+ondugO?f#7=vy41n)k2aU{ zf==K~MDMOA-=;jJ>4HG)MEiSC6UZob)}_dYTH&L$^)zjBLVRsOABX8BHz ze^db2GSSW_NW;Sc7WiicFm%rr0K{ia0g5DqMIq%Aq%QEbc87)4E0!CxW|6d;A=6S5 zku9{SLQxSc)^E5dLFnxli=_Yv7XYCACk5=hy#8gtH=TVSnfH|RJ_u7ruB#tjxH$8~ zNRiMaj>^@-8{nQ_@M%Iisk60QV34x<=qrL6k*>b$|GOKS(Vad2N%YI_GxPJ@9X?gBcgqL*a0c33&3Lc3dbpd(DfI3aD&WaeN{F2hy03eo%nEgxGXqsrr25GvrVNR!bHS>3jb&^ofSRXZulX}OZwmc z{r{Q|X93Sv?>_+fvHdQrLeAFmX93P8OFsZku(`xv?ZLmE=tt^w7UgV)@dJf|_Pa{X z<{M`beohU3$O8arScG2^g|p&kz4#9ReryW(^D@r*^s`t$yXYTC06+#7>%Wikm&4ZA W!pE8(06>cUFk`LvIyTAx0RI7F?$F=> literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test_converters.xlsb b/pandas/tests/io/data/excel/test_converters.xlsb new file mode 100644 index 0000000000000000000000000000000000000000..c39c33d8dd94f0a1928bd1658333625825d32d77 GIT binary patch literal 7810 zcmeHMhd*2G+YgGUh)wMTv8qOqnzcuViqV?2gA%l&w%SGQUA3vbsZx7{+G>y1s8O|5 zt5l6Q?K3>}^!*Ed&&lULC+BnL`sUo%eSNQW5I6A7Q2_`5KmY&$1xyCmnfc%V0A}X^ z07?K5S6|-V&Jk(nXr$$OABn!p=VFV3WS+z2N(bO#fB)a{KdgaRt)5$L0%Y}UpGc!h zxtw}%pd&8`xhQD!6v!>dXN1+6-tN@Gj@;oZ@`$Q@hv2vgp-Q2dJ}V7vj!Bv3p~+B^ zZ{9HDxZ&)(7V9?0&#qRS%X*6e+< z>qaODSHA2lR#h#pmKX^i3v-c=LQ|PX zilJUuex97*0T911SKfD_eGhAnH?hPd#xmE)0f|EM^Zhvg$Jqa2n*L+yq0s|a#)3Mw zRO;myzpvV<05huUAkU}Ccu419b_`WvS?n@?5ZQFX)k`%+!+W`-;-45 z#Z>mh-Dy?e4$P2Ob4w3@pcx-=;n-TH;f@7euD8ysw)!?Nvv`zX?A`C&EpIYFXyB)RYRwq?bdGk$$mA&{;XBaSgd`<6%f!QH3R4VJz&grdP* z1ETHyJV6rf#F_Mg&kv=Y^VY`Z;7|9MrHmpYF=h$O5FRE5b9^IFcN?kBo}EJ2l32Sh z=PprqK8$O_QqA>UI`9-^HLW_)&2|Fjr`@TS5R6RHkr!tv)iyw~Y!vLZn=Lh7sWvd%kM zA&u%r(dy)Ic(A+kN%#rIZoB^SG+JCbbvgNS{Z_Y*o{tv|ncVjL7hrZZGh4~u zh7z1Rfr#)l!$ByO7N(@-}RO&8@_DndAQ8nQA-%FWz-qStw7NY+OdaHGC%rn(Ln?F6`s`b^{pt0iS#+Hn0dM}OwT;sWF$JUp>g)eo^HAf=V-eYz9Z#&CS z=XlzJRXQVf1K3XO3~hx(I->t{RX=Y8|8`WUffj2lK?&x*r*9cM8YO=b=m^rBFp0ET zVV0?k!wAQRA+PUz-L>7A_>{0OopJmk;Py0ENCYa~Mh}63iw{)#Z1s$3UGyrl!HW!M!4HQ%;N@%fT^Un`jEwvF@$BWmPH~bM@s$Qhx*i zNG*px#YeIhJc3l)-C>b%Ny@_H3&RPRa=YH1Xf+h4r3!AbNlX<4xaOG7Ejv$$hMD+Jm8Sn zQz&|g&i&mquh?^#jn4DIy%~?sGsAmILrWL;*j5=A{3&Uw-|b{aug$q&I@nWRzI)sD zDC51jLApyQSIC2&yT+S_lSkWrJyy#8qOIVG*B*pb?+`(gx5_(}1w!{>@HRn%q%w9n 
z+*>jC9wetzZZiy;W@{BtvniO42dkg6v%kq^YEcm@8?a%174^R{1M?Kpo1| zB9@n>dO3F`XA7&)C+B6_IZCv*M%}93rQxOz0`EL?81y_yvB+Js)FFvw*7I;)o3@^8 zwmB|jNPgRRNauoo{m7kR|B*Vfkpj~KrHzYSVA~o&m!P1vc3&YCNxs9MUDJr#xZQ8S5pZ2QL#J!}xZ;r;4 zuDg^hc%7_N&OSOC0{gd@WNz=Q<$6ha9qc3xoE*Ln#-F+MCIie84MZ=CUwvR#4jjwL zG`{>guq#OyB(DG>rt-B9SxW_y%gO zbyx+$#sCH}=zHYM{R{b?JLV+|c2GxDl7vio)YOazLHBhOpC1{6u)PKqv2eQ}O?Z7r zxWLPzO+KiS0cj_fNwTUxIzg04p$JMBEuR&6N6DStx1JYu4mzrY0&j{kDcKfo&YMvZ zT{)LYvm<{qOu$eP?ZcYX1AUUmyzR*@$I#WnO1DzOOdGkvozuG#Cb$ewVjwEgzHuNz zM(aLB`jPc=R4_ytp7uRRkZBUBEE|-WNFo7wkY^@6uZ+m-eAVSo@~F92LLpV28Bs4X z?F@tudA=ljM{swPyZF3^;53=1WNnoVvM6ik zuN&%$wJgbfmnaC9@Df4oUi#$7%vAQ^0Hep(lRK;>DN0fJDLO5pZTPXuVAa7YH?MQ) z#I4ccooMeDo|F_*<{ZHV0ScaCTz=0}`=C}d6xK-l0L9_fOz1SfW>pJr-jc4*Q4C&P zN&K3-F|N`!B7lpzM>0F#&_Wyb+1A2MUm)zfcORw#_fDeS^%DIAPVI#FDY478k#D;D z*fUtO+%Av~Ua%bHRvwUFP(@E0WRIkC%4dA=8`2xhyK)@Xu81aaTAPwFyUT60p8VpS z-e_PnU50K?N^^`fqmob_6W?A#(5?5olPc^hBwpmmM3{jiIq9(X#ldT%5LR4HFigEC zu4iCa#w;89ME@18?{t@A5(V} zRC|BD8!f)d8{y*=UrdKv5WsHZAE-HLyqiuxExbj~9awyK55-0&rYfv0HH@I16xrah z=q*Fx*_7Ao#Ek1)k_SD@ao*Re9fwuF8mX=Fl$(J;=3EbX@9r^@$Y#BK)|<{jXWv~m zvW^;AXX?REmEDk3mD4<$8rsl%x5I3b0w&-e`PO3BU~OVm@lLxyrBIFKV4TRwg%p!T zyydg3bwBR<{?|yz1R2RlDtjDBjpAm=`j|JHWol&n_w>c;hV*&D0FAaS>iPn09^pf$ zsyAPctY0lB+?c+#1Qn(sS3*i3tq%q6sXIn_dZ;fjuo!19U97&rMnj!=(5#__86|w` zfc#E)A<+AyWZvzAKI=-&i^7E)k6PB8=mRVeYgDyW=`$P7cuj-r(vB%+_+O_+n+td8 z1N6jz=M9V5SFSt@sMXGY(co4$_<4%7(iT)C85hBe&Fk82_E@lC?BAiDuEEY3ferJq zcmTkKe}($fAn&2s@T1exSoxWgJ|hUfTtHR{)9I~(iE)$(dJR;eYUxnJR~t%M$9+2JiX|w#qP^o&-o0K_iLDy_GMO3vE#Ew2vaZrfggHWpGN|1~}3W_Z(9!sc3 zFukZiOvO0(QD4&u*5ro1&L4?f(5g~g6s2D4H0|dGzR^V3oPU^+j5}DW>oY00*2-f{ zsJcys=~cq#`JSBId?}qd&7)tbaj^m28`8sE_vPGr4N)rYRXo9T$cAYxj~EW#x+7Ci zZ}&pX8ob#uybdU2lJ@9B&mei)uKQ3vhTt>i3w^S_zga zC_TPW%a=LnBFw;Va6I-(YBj=?)9Sd1AxHYRyFjfFIerH_)t)+l3ARAHkb+Gy<2gfASk^K;?Tru)|tI3O4H|#}=mM z_GSnN`}=5qGkXW*pN-f5>QSufJfe-%`~;}lXH^fu&j%_rfFTsd!Ox`jIR))PI;wLT z$}A8nZ^Jg;>erI!3Sa#?gZ7eIez}4&(w+gw(uDPrC7$cV>!Rl~Ps|u~*qmimW8e`h zrYE8k_g1PrJm5jvw$kI<7u=Jj!2|O& zk|JCwrLkj^?}F7HBEDVut&u*4N~;TDXMWVk%-Hc~75ry)f-MhC99*y2nIm0(RwqNz z#sFM=NIb4!r1xVVsmN+dp4Lup?^>DiN04J6COi}b3h~(&B2z<}VzGY12_nKvkSAD9 z&|p0z**`g9a{vB+l=z{uKaS*RS^KvFpnxsBuPmTx2C-=b$fqdN>dKcg6P$S`%R4z{ z85QP1MN#YHY86rKlTK4%i|#H53fHEY-Lyk5bP*T@?-o$uyHV1w?ZvqiG#PkO;0x{( z7;NhVvqjg8Zq4S~g7B1Hswh6$)(Q@Holv=!)5dHSXPy=d$1salJ>ZvrtLI1BQos9A z*Dzrz413Z&LfJE!n2v=zsGoqJ$+!Q){-uZ?-AMH#cCkfPv zJW3pMz4U|@()%>-0#t81`F^FXUCtB5h{#bk=#7>1vN|IE%YJciFJa@z z?_a3>wO@Z-fAe|`apPBjU;Fxh3I4d&VoC9jF8^7dy-q*G!1^FM`88%t+J&o)9%MSZcl{?+mPp@!3p>1Y4;Ea2I`!YQCX z=qKRWF2h-Xv-#R7fGxI9@#_B*G%D59sqEs1_1t* zH=Gqe>%&h0AlNMQ*JYgb=x4Ehb+eVT(_JHO;A2e>003b>OjzqxWB76R Ef8hk^1^@s6 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test_index_name_pre17.xlsb b/pandas/tests/io/data/excel/test_index_name_pre17.xlsb new file mode 100644 index 0000000000000000000000000000000000000000..5251b8f3b31941aeeedd0a45df83d2018d75afe0 GIT binary patch literal 11097 zcmeHNby!s0)~7+bMH=ZCx<$G{8U*R?MnbxgW)P5;2Bk}okPeYKl0j9ewY; z|9;QC=XuV|Jaf+aoxRuot=MZF1sP}r1@DLBkLnBlqzJL=q{i7RJV4% zQFcEwj?P;&9$1$36UIOXPQM6qD}&O(eN>09<_4_A=5A@-?&cZ}j|9fmjb&fhN>h?k z(lr*Y;^ZpOGZP~DbV-r6hCfw;<*GiMfgp;(7PAT`X_^;+dxyQ@=M!801S)K3>4CnD zy_s`07K4+#$~OfINh;Ph2@QC+eZeZP#3{@9#ES zF2-qC@Ztw_&2y8cTA@Pm2%t(*C;JOBMnVsDn=nO7R=|l~Y#+_^j#YL((4QtoZ_N=`(~g?YLH1*5lG>$YV!jcAGk}Yx*xHf=;pXd_(mp0}Y}dp-U9= z(x8MwC>bPXbLjv?=ISn7S{KvjT z1Tt-12V*NoCdTid|1;_TVmtm}>fylt?x-K3W&^2&y8o zC z8+5TV$xD9@_eBZ-YzFTPX|_#@NR@ z_=b!Oae7E8Y-o|VtxUx@pS9N3Q2=wf5>ELjA}q3GX%8b)b<~9P7Ng?Kaa&60L&&PC 
zE26bm;}cS$PEntpNL%xoYn#)}_FU4?FPjtE)_g61oRp2-L4q{VE%YG@n&r7ED(Wfk zKG0b!A)<5d@cU@#n2kTkt7uMF3FQ%X&aENF6anQ$yh=rZJUXOzU=%UU@kFrV6jK%- zv!g^G>Y!-YWQg+ha=Jc{^7Ru*ymFB(;XUXhRi;VZ;L7*AvlTYG`~lbz9?AV>SF8`i{JqP+z1B^N{+G|)GQUOy7cDoQ{`SRcE?oBEwt zcYHjlU76yxv%T59DMTm<3^LaCP)on@7Dbx;qH6Sa4JE@a`3(&jYhREL#@yWI&Mw{i zj#4ZQjUZ_?75FGxj-LX#jHIq{Gvovl%T^HSwj11$uoEI=(*~C;%%3bkK4Lc;8Ve`B zpR(Z`Qmd3(aucAu^66xI`^oA}nCMfZvXR+nz1Fu^-mXNxyeXkHb znA5Bqj`X{q0Zu#{yc&r(-_geD_42gf?pT}FczVV&6_ZHS>~Hop#CZVo9&uYzlo`DBSDMMu z+?KSmu*k$%6;6qvJ$0(?M!A;vDfS#QNM&ky7o}t=C>6G1#l0c&OoU^>%WvEhq+IiD@QB{V8 z?F2b#ENs_bz0uAeKY4ZLjKEw z{NW?-mgUw%TJ<+ew=tu`&DYI83aFjl*cB2@Vsu4+5qARFVKfDooEQybJ~#WtseMVH z^#+Aa>$E13sgE*Wo%{J?Jls6~E`x=uZ{PM|a#b+plMyw!YnqJfKN8HkW=%;OqZGW( zl`JbAOG?8C)8tXwU@xoeE#y(MW?_2Wx+FPKx-58|tnA&e;wn!iphgzk3A@lNP|^61iU+$!E!jOCVzIx0DPfeMD>mn!U(B zQ^iL_hgC_O5n&i=uavsz5asC;<+4g zqL0n4r*6QJ1SzFX>!|mmx`i^pZ;ivArE49ueTz>*To7bgzCqP zEh^9~YBr>3UFiYM7Uj>`I=NXH-%c}IsYe=G2)!`X9Qe4sTWEh5 z=@lLcscYElENRkfAH_kRhXFLM!vg!+A2SX%(_bf}4J>EBp^+|3JBSGjTWlF|I#U@v z9&$9YqcwmCN9B4MjNxnk%;k;YP#X_}A(pC&0y!N^ORWX)rGEhemH(ALo=6R*RzR48 z*z;;wF{f=5pg)}UN1Cwh9nz@>XTY^(+6;-_YZzo*Q&m5O~PX62})>1AQvT!Hi0v&^-MGNN2d zhOfx(ttCxXGOMdD>A!dd4htWFqqrxIX|EhbkOTC%8+{Tp#d>!QJr^YvW zYG&#eLf^2nhzxx!rlC`~*}r7Jb@y~st$tIqQ6b~Nc6cd1xs!p-HY&#bTMVGsoW(iu;XhE@62B6eR*7L-FP#r zTAv||i2-w5_N9r4Nri4wT_c8{!O$&{nj*+Wnek!aydn%emT#1gZJQc?)zp-9fcbU? zosgty)pLkNv`$U#KHrMdD5hQ*rk=`V3Y`8}}`IZ4GJ;Ye+=oOLh4pf(BqQo?4 zEDSjH>#iI}OGQ>X+RX&C7|JqcNKVj4u|qj9vHn3Zw0`!CmHcMaC01*fC_V+O#dCXS z-%e7}A@%0qWr`TZ(t4X;G$bD|_-uEw?&%b%T7Ec(x)}xb?by7Zw#fUSo(pFrz$tTn zGWE>y-NHOd*PH=z{|HFkITY-zCt*Ai~`Hya2&%-S085X^-7Gib1uIup;PO3_8+7slx z+)tO=pQc}5&nzLH$2ToK^bjR~I0$eEICX=me$ZKy6kEnk(oWMhlF>xAU#vItGy}Sx zqq7ikoU-_67tsu$+O$=15n0@-@EN-(+JiitaGr6kutzKh+Qv)+7A?xeUIC!>Ikd(z z9PORanca3kwSdftN(9@0}UErd|Dlsr?H-=h&XI=ws~CM=|q0pF$*+$d zIwX@E3lmbRPLM^kK0JkXgP;u^Bz~&8uf3}l;uXX_?`mGNr?ePwaV%~Z3>PUK1~X`D zlKC)x{GQ993%9AVtZagB(BAPuNQd|M$b-OQugrq?vmDRs=u@pWHA^AY;>P-3sx=~? zkyy=7yW;L)~{d5(XaJF(p>+MDaK__|nknyDy)HlCH4I@~)#3?Q1ydsUW^#O9!! zRIA-`%Ko4!GnWls4M?&GMCad>hJ!?9M4uQSZ61|S32a5DW?meB+)EKnyCs+zAoKdN z*MZVDBG}QtoO(2#zjdOItZZrptFHcSLPeHzUxyja>mxfVGgUV3ccr8Bw92JB&jUuQ zluYB*xZWn_u1oej8Dr!$ePQQjgU_%!8&C`1TnVG2<4+F_z?Tzw9rN8^{W_D)HDyF^0@hpY31{_l+ z%on+M^02V>ZUd*0gQ+6%9ua;y-dXI($$b`S(oLnPq}Q-^Pgm&mhp$HrI8_@%Odhd) z2Ke!QNm2ipfb9HD@L~2;2Ww@4zD^EplL4T6bI#0Hlj)OBsErdRKy94t!8He!|?4pl9x9C+rHb$$RLqLZW)hSDWZXVqPTL$K?h^)i_nq5_1gSAd=FAw0u zVqjA9Il1T(d6TpN6)Vt+TfeE4eBO`GsXA8B)!nYvyX;NSGi4qn{TQYe&j zg7K))z{T5;-f-2*j~2NvExd^agN&N%B3RvY!dxGe?RQ~)q#G@AH8+;BqW=lMFVP|$ zPEuhaN`(et(!biMd=J~?#A&abLE}@Gk&2LZqIy;5a&&3TuCL__+cO1(0-Fy~Q*g+; zM%T6d=CBdakC(BxdN+)|&RSZ|r>A*XvG^<6kZ?jjS-sI}_uy0Cej7grn4XnqBqSHp2mb zUd7SEXPmW>MKRt-C#6f4b=4!I=##OOh%&?5q=rz2jmGp|7#*EwWCPRtajoIB=D>T6 ztC)Bu>lp>mG+y+B(Kgqxzl&|fsYF7}pqTb=b9%PFs~%uEy%UaT_J$vP##0RvJ=t9A z(!_g-q&w5e$;Gahrsp5^gO1WTkHo~XO8u3`{mD)-A2OlNGL^W<>s3C4{}RyMEQGjr zaBAY3gK=)EX>+PFtz{nfh3)i{Zk zn?{-2x0bM0``R#XJuLro)e*4pv*Ynif%BZsLqi@a)JiU-9^ujE&i?%jOUP~RiFfH8 zzu#s)1xRJtfHEn?f6nP|=k5WVW`tJ64jNA@kSkLq+A1{b$!UYGJYZPlkrr-ARj%0= zE}n6as)yaW$@{2AWkPwezf*OZRFcYrtlnY%w;IqsvpP+g z7&))8gxU~ZKI4jduYfsz?38!HxKxfqUFm(Z2%zBEIR6lWZ&H=(iirwy7m}6gnMOv3 zU5~+Hgwy!eigvzynQfFuPPi*WWE;;Yx{V_#5{!YfC%jUOfIPpjsaG{4 z6Jm_oKrItV3s!lcOe{qL3Q)^A1(ETC`XXyLqr)V1ca9ve2TBU?RMKG*1TbDOpE-n? 
zK%5K9yOnWeXL_lT8|gJwYv zN=;;?MVEGqH>DlGoglOKQTtvF@dK4ZT<_s(g>Y!9`_;gnyR4dqiihX4eZ`#qnrhyk z@}+i61`w0++H(!A#^p&3axDI3U@ZN(mz*_udeEkP2WOhrS${2YZJ8lsNvam_hOrzl zD4cjxipuk=Bs9!Ojv+0VSQ8Cg7u^xSFEFDwSk!d)90G$BnZ)8NA`8FlasVDlbOcSC z!oZ#i?~5*nX?0-b7eRcR<)8NAwth21uS`LMXhST4p$OT1HCq@bmE zKJ4_6;!q1K&a?Ve9HNu(38<;al6OdnY?<-KO1+FQOSP(`P1D;|}Cnc(2? zA%1anJZ$${0|v$bR9Qf}kq4c`p!3h7=byDa(9T@n!A;7>$k_F^qBjK8gMfl1jfY~1 z_6hQR9L+^S*V5(V^FgqzR*<2$tBDyaq&*Q2FISlPx9UEzzLr)6*@`)6Z;Syd;2YT* zDmd8MIWigAIvD@i!Tw*}9;7-CprTAGGj`w{?2Slwg>_P71fY`0s{B4th#dBuu*3Vn zDk-VVnd=7gx>3~j&gGPgl@ytwuVoB=Ow4K)ab%e7G!X4o(H03AgY+|`c>}Y;K!wpPpaY*<>5W?XsYyfH<*i4a z;} z7GUxcw!&uTtRapRH}6u2;66ePQd-X0_1NS0MT5;kF#LEs7Fw^~4<33|2)kj%+T-=N zxYm>1i!kI(^J4GU?d1}x3p*rS^j>73ppfOW7nZ=120|V{bxnU6Pba?tRS14HeLzhE zN*aiZUZ9#4`j1@Hx3l{n6+s;Q^GJ!Evz%r|@0^3bAPJu(sMV$+@1i50S#RxtlA7*x zi0v=SXLvBvPI0kC-APea$I>yo+nG?ppBFna(`c^|9AmW~Sd+H$P{dZ){Hhu~Dw*ji ziI_NYBg}n1+(jRX-Qx=v3mlQK7=Thd10PPNiOqc_U_lvVQckseDqR~KcaA;1h^mJF zS@SmjF@x>lrP^ZJV1l|%m8l2|HzV)0?l9}?w5T-BU`|NOlLG5hjwW;ItKK9$8kH2! zSpp~N$Yi>~qQULV75~0C!aNJ>rUljqTZn`izy!Y+g6mAyu$(|OA|_sTLId$r(+|}I zF;@^R#ChmMqDX02YD@$`!|i8_`RO7aIruSB-4uACopyypozBH;5<-Z%Ewg$MQW3O^ z9pxh5bc#q1NkY~P7s*?j9?dJ@k1m7-!%@F7osA{E6dT?qoW6g927KQs+wp>}iH&?8xn9Lw?%yUy0ag(eF@L|?S!+*}eZRm1#U3Y4DKlS)z zhv6nVPlEj|t=m`bZ}&l~^DC!Mn%#?eKvz^j=f8C3w;9P#&RpX!XO8^bH`CGgoCST2 zjg|QlZqbGjs^$l%D;^ux5@fX4+V=Lh233Er8pr(pa{|j zb)Z58vfJN1@SzZ`A{Xe&zn$f6(EjOx;IpJdhX>743&y~k3NENZkT32f$RqqYM@T3D zs9gQ~*RQ@0?8oCbFJmdl{0#8(OCCQ8PlG(r53hWH3;x{o{!tJb6w&{=2M&${-rl@J zGPw75-2bbK4lWGd^1BlbL;PF#mj)m>3V5ID4uudzuAftn-*&CQ0m0iocYws8+WXId zx1+nI1UL$KgW?Y51L`fxFYO9&1n{E&9YQ|Z9|(W90Ki4TYuI<9*ci8>;1z9f0PsS= z9l$=wOZ=>ZAC!7mHUI|%-*?{u9)h~PKLg$>^%e>o1$<|ChZ2tuj`G8>?)Hh`fZ!Xg zJHXeV^8as!bqfWK0-o{Tp{Nppqx>+eyZj#<5Iljt1AGc{!9NcRJe>x|`Z+(oLxO;4 z17ZD=ErW~yoYMR#u0ZyK_@4<5xHNe1xRa-#_!krTt(L)K!W~xPzry-0k-**k9Tpz*EODn~M8MlD)Y-0fDO2Gh8+4S#>nHlb#e{Ph&YeqDXmPXPKjw-Gc48q}&5EFXyN-3R!6vDJ3mrA;V;0 z;A|WWN26ZH#=98H#1(H0dwg5F5mY1T3heS;R{Jtm0N(;01fR)yEpX>7!@^ zT~v>(g$*ZZ=_{kR2?Ocj6jzZ^db};X(WfC$OaJ`9fr> zL=lChBSzb4kIknDDohLW1433sr5+df45YvCMB>-V(~ad?|{zrr&wl ztdFYnatM*oK%U5k)=cO8UKmB%1#*kGmDSix#!ObpyOy?PL;&FG3K5|6i+4--eeU`S zH(7bO^J2oiTi*d<<;cbPMm&nKg}f-oo%$ zaTghE$ZI09=^uVQF=oo=ucf@$JA|e;!3M|j-wQ~xmHIg=`x)kSX-=u{k2IC0J?xSk z8fJdQCg2e8)cbjz_ga(+Rao<}GKA#D3Pl#mo0ppBrvD(I|>puK?O1j~Z-lnEb(ED!|w_bO2o z3n%)&q|Sd1U;5wuq@V4!{2h1})ItIP$N*@!+^o3%;VQ1S4wi z21VI;7>aeh2JCo0hYGSpq`vJuF>8NYSsEi6Te{k;hN?ve9djCl>0>j$;-4u6F1oB> zO#7_+AG%xXs7JKR6wM3sUj`swRqs9|8$ngxusA|6e0eIM#b6t8%ZF^$k99l5s{3}oekS6;f(VDQ6+ls$1)O_WIcfHYU9SX^(CN$LM@_uiq?;MXg=!=z3tY~4R?Z}+c6>jF&*JE%d zYn^ysVfJYGpcm0KOcWaa7OUE&K8`r-5f5Ly%hFR_i&k#z>pvnwlSKl)KUl zTd|+Q)*m`Q^5I~~)b&JA%=u<%+t&IZ?mEgf4Sd@P zqHkA+UvW~nej~ zyHaZ|Nl)Y1qZ6Jj40O*mTwHUVzI)e~%H!f_%ntQ;8sL!6=YqmluMXD_$=@_^y>@tv z26sHf>uyo77&B8kpw$!4%nb=qYr=Til?}Z^)w!1M7dHPrm`Y?z?uaq1SwoTaz0^S6 zoYowFEvc-F=}B_p0=s9iT{f{k8x?d(}WpqObMNxmB>tLtoUI z0mt0X$H(5GJU#_>xv$lTnzWpDE%b;KyuOXdVdrNK0aLzBYcB?!@LNx@4teWdE_(Pg5X|l!Wd&4nOw(f)mJ*F@TP#}(y^}bR?v?6u*sh}5^6%@^g$(>8unXSg%2t$5CZ*xO)DcwK;IF72 zEEZI;=HYtN>pj};wtFr!(1J~3E6?YS>HdX@dy1~6|M@nF?qrtZPKxxnJI?L{qOUs) zXe&kpP|qr6E?o|&h4eRS+scS+-h>qZoQ0RQ0zj%Ef95Mxy*2Xh-! 
z#~`Pl1L0FVr%L(!#aeBER31$Mm@2(wQMuGdBnzFd zUNLaEJ*X#FO2}0nX;smB$@$}2UCP+ryOo1McV$<~laFr)guLGJ&o|BwS?dxwHxdMKUso1sL;obj6-=Frk;Tuuz#E=PtUaK^$Ordn;7@>q8k zkT*v=qgFM6d(JwB{dV)1HxI+=HH-Z-B_AFS*(QVryYN@brY zB+3cmw3w)uz;a@f^jFDbpBUacmh>mmkmNwNQi%=}`;J~n6v&WuI`rAU109- zk1YImUj*uKQ)I$}u?pFbX~@q!?PLbAhH(9S=lL-W+17$alHUcsr#pBMzM8f=H9)U* zSBD|yEhfymh>>G+Lgq-Mduxl*RZ>_>a7zc|B} zI33zy^F#zV4wLIXY+i%z%`8;)B$R!`EpQJ198jKbp1>7ng(*V^VjQV9U!Xd|aGnDT zgms5hqTSnJc)ZPaPVHkyx^n;so<=xlo{>-xUi5nZgq@MjOuCN&)PvuYTrMCXwIz~M@Ad2~6`bnl@= zS8kU_EBDf`zI6~HHW6iKVo<-vW14jKGK3k@ge#y(o4yJ@NY=takmSb%`r7($apLkX z9y4Sy>XL7@H4^#ea5MwX{O^?xGPTIwpcL;X z>{FVh7oo<_Nh8&C?|pi`!IC?$`N}@}jg?I&w;sdB)I-q$^9Nb|N;2Ik1M;G{_(rUi z9VR}$?G5O2Ov)LC55OkeQc=z^_JrRe)~{R&e{8r&!vmjtJvuO;bzMXUX4vDB1*YEj zSo$si!d=YM>>GX&l-w;ltI6{@9gpe=kv|ybIYaz7_tC0HlBQ#PYk@L|+#B=$(*PN( zlr*7!h<&?!7oC_FOwZxaMSy1?xHB=yRPeFcSU8Spp6HgrIU{}SkQ0GAV(4t4GSZ{8 zG8!`!0taK;K&g(_EY6Q*Dm7gQ`72s4D5H_j3v+jbo*_uvS&0K9rP4;#$|~7nzJd6f z+1|`zF}LMKSfLd2!SHllaTLQjQ!UndtEI&gZG_NCNwqqn0TK2fVW};5vnKOW@~v2l zBT42;O=fuT+pzrItG~*lfqolJ8p=y*?cvVokwu`CyLd_v@W$Fk3SJk zhxBatwS-g-d=79`B6xc%AuFa(Ojh6(2e>_#BPc!Zg|vzUeVqd`(x+60DS(#yRIk|R zG5zO0q{#XtmmDccA#bEW#hUh0jU;^vcJ>*BQ5bGF)8uQ;B^|WfH#%%D0k0k*q`Q4E z=kGy!4_+0oO?yrr$P)APxGEz>!dy7(5C9>PB02vp2KD_4C$pk4>!-^*(&aRp{UVQ8$;%~o z53S5wxh(NxaxXHLYbjcm&H`0nJH#o4s;uKMjhbNxm_@@@yj-fw3Js8$Dgbh?S!ko} zHq*NSP9@{>Nd|EJS-y*TXFF|70=?8DUA?SWwni>xc7&|M)}kY_3F$dD{=yQywvFno zz@WVvOiE~(vj^ou$vqET>n%;cRE-JOCY$EPIPaE4HwLkmdFJsHaL&rO_HpT$#~W*- zXUhd%#udK%V7>FJ-)6MZR7$b`sJi6)SJkBxk>uMAKh(@8TGY?vbFIr0(G&0N%q1t6 zx_vi2Z7~QsOamQCNoAM&t4{gToDk4*;mmWDxhg)ZrbYc2(AOr0v377`;+BnV$+kgO zm6Tg9q-i+_oh;O6$#jVhoimA#F0a)jP9Ax_B!2RKkJ|d1b*mdBiA8`}Rp3PfMZ?oe zqrCN(`2#hF(ZvIfM^^w<`(K)@S+ikUKMVp4o;2h#an*x$2{*iBp2BBRwwDsPqA%VD+&kD-@A40mS-7)X zw^3|OFo#9o$ag+CU>h%@RIhZXco!A@XO~rwq{T$4*Sj_&M6(QtiYMo!>G@r|dx<;uDH;XN<#tR8 zG=Z6Yag&k-s1C924&7!Vc4xuilZd1d$}g%Y=_`iY^bPO@eHPAT0U+#!&CFn>%&|JksFN9iyJ|oXS6qUw63U@^KX2Vm&6qR-BawgNasSQBc<2dPV_mr1^}qy$y$7-@w#2G6c)F7rtv62m^D6J6@JroK?`qrnhvvWRR}C(G zd#)#OHQ}7(X8*)B!n+1C=+H7(SxV|@)9;bLkNLZ*PKH5d;T_xpl>c>`{j;|Uj}ejU zcu3N7l`sw($&f-KdJ8m6MShZ5An@*;a7cPKhKAE2HYG*Mv0Qy zr|a;2de^TX4^JA`q9p?5YED%!U$@DM0$HCF_7x0?$>lE?1*%TG`&=0-Uv?sgU9VnL z%_skTeE5FSro2JMZ~+@q)w;fn7^USYM$8&o$S{b;A20bZ`pn>+=7c@L()S0|`T>So z6bi+$^x|~*s+pk?jS=)DWDRq&NQsRIPc&XX%oA_wrZ?=~HAN{m+sYnM8CEQlK}CAB z;lE(>F>1CO&A2sO&t~kA$Og9rp(|3M-Aieh-{@dw+%l*22TAc!P_^nu1IY*wN2CE* zy)=B@c&_hIFoT^eFIswznRM}Om;$2_2oaZiwNsq#IR8s;ZdzK@8iZ@~K#!kKku{Hy zfG?9Uv4!x2l+ufK*7V4)i}=UpHbkKxc}}cTa#J_o9>~=%l!U&_mCx9FlI`01W_I`I zZ+(Scf?vKVX0((^@A-aPZ+-^yo|Ac=X{S18IJC4^>oeH?ETR_o_$#;B5{K>s?k}*o z2DFQf4^0NkHEf4b?bN^Z_#u`@RYLH8)Zqse{P<(q_Qzle-b@-exIeNnhPeIQ_0i~O z00iVa@d!MxeS@?_Ue^(_w)gsCIlsudcxct%+qyW^+1mPHpw*5fgj|b+B$t;54*8qj z!8lprl^Wd2jNx1hK7593Y-^jBVl?iWv9-FrqE{H$1nv}<7=>fc ziAfC(2)-r6%we;DsoUBs%%s`e3KlDGCDQy*kW-I%b}TC?LvKn~c{vWjUBr?=Cz7?~ zPQ?HIxh8|{&{U#KJ|(W)ddEblJGD}fymQ2x+mIimz-O5y8q}}2f4+KlE6a6 zD8OPnvh(!4v=+(h2I>GGyVMwnyq`db=({!OS!1|Frclsndi@dlL5KTeV*6#BL_9^` zVyO2eSLN?h-bEWGPYm1?rQoC>-d|18z|QV}miWVc|M5zR5wo4+#tPg+JfX&dkv9w~ zVZA8MlDl_QZg4Bw#Z)2JD6^72q&RAOD)LoS=Z42@#ImQ`sr<;M&D6K7;Bd|$ur3s= z{II;j#Xu)CZ?LWin)Q%36C{d~-$|wPaU+s?oV5ojk>Xh!{w=l_^6~f+k0ByiS{Dm{2A+MH%*r zX)eJR9WxX>AT=+T);{>LRF{)wdD+Z}UIO^BPEBohN3_4Q(S5(@wHF>peC*Z`UFi}J z?W1Qbs|YY^y}1ZxAM$`(c?8XULr8|jZ6HwaEYsWK=Uw0ZH~^W^RwTi}4x6cyyD@3h zSh~7o!M^wbfpVP@bhxKgT8Z*|c+k8~)SxKWPOVx-&mM9C#I!=sbnU~^FUafS^%-MTh)JER?$~v&s)}$LVa-3j_lWI*2#3*Yphp-mD#z z4NuioIp}@ikT(IR?de}3|1PFtzJD{e4Bsn!_}mT;euhVKBLgcV2{QvY9S0BOy3slW 
z?RW5yc(R_S^|5V6#*@c4u{F3vn$)oilY$OlA@+Q29b{?T~?xz%F%c46esnMWJqZ7rA+D50N9AV{WX zmoE!Hlow(V#ew_8`s549jJDApF`-B6Iw&beN719ot;+OJsa3v8W$i2UBkC&7@?$_{ zU<=zMJ>J>*EoI;$M^N$~<+h(S?f)ycWqg3=wmx`nqX^OYce(99zUxn_=Ktrew-7ks zGvMDpo%Dy-{(1V%vr0;GzXJUF0K}h?@DKCw*!Y*nA#Mu($_W2e5Cy(3a69}L{P0bj zo6O%ek_q~smB~LD=9|Jd8M13(PmDi>{~cd;6YwVCbPebYubh9aldkEfnz@OrO)HpY#Z>ExK z`83*}3;89n+?2i<0I#LX;nel7%KVEOe!DUM8jF$s-(dX=tx9sp@Fw;LIY9`Zfv3zA IxC8+3e@VK02LJ#7 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test_squeeze.xlsb b/pandas/tests/io/data/excel/test_squeeze.xlsb new file mode 100644 index 0000000000000000000000000000000000000000..6aadd727e957b4b0aee69ebf8609dc926ab8f3d0 GIT binary patch literal 8567 zcmeHMWmHsM+a4I2Ap|6)8|k40gdwFtKwzXK22i?N=@y9*L6B|)2|-dCQDP{i8$r53 z`Wt;8Umwx;`@aA0bJp6k*Q|Z+>#TF1>yCYG4P_u22>=6t1pol(0p&TKRnaH_04F*C zKn%b_)t9libAj2p7;Aevz?==aJ#200GSN`k(gCQ*`~SKAi#<@U-KEybOIXjiO%VHv z&21bgoqwM|gNa)soT0a*!8TL0>a(lz@uAH?X!^=^GVX99Zwt4BkF+{DiaLa)4nk&z zQzqDYpIuCrC{9x9vk5qx*@_c7pq(R>F?g77gwNiO0a=2o63fKpFWAruuPKF3c$T{G zJ7>USC0Y$U3P_#sGl!Qaa6j9iQ4Q)jmr7kwSUBA`bso$p9UT?`P3(oX(wf43gIbSd%C;`@SmMTsZeblIo zqc;E^x2(EQGB#ZB4Ja*gFvf~f$?mT0?B*0lI`)a#30)_#%1dD0ziDsfR_vT9O+JiK zs#+@)25xUM!04mp>vMQxc8({`4B6;t_RLmS?iw(P)sgXT z4tx%2@D}*OIF>o;8c2&|;Kc%LL4~A0k&&kWq?@3}a&_n2j?J_qX4F!uP*8 zX8-o;QE>=l_<}k%73yUcPFCy`urey^=&~mXIS6K$?WigrXE8wz56oHf1*)jeHg^$e z^^-k=`ENyJua^Y4-VQL$Yu6jTv(;Z;k_u^;?d@ZWzAoYvWa#_ojqg&dCQU@c{%zP* zvg=>8b`W8=55;eDwpDaJ)oGDs56zHK_ezg;)Jk|le*OT`(5LOKB{h#glvSf=Zglir~?@=SWVOO{2q)&%ozncL9CYF=aK?VeRrD#XKo&QIHyqr5twe&hYf z`}EZp8yX3o4;kkXpnQuY(?Pbd2y!PaAp4blWNAmPR!^AIj36qXryanGsy)=D=q03zJ`kIbl8>CX+k8RlQ`U!$&ZcfeAFm-qorGmn-Qa8mgSj~Ke4qHg#gR`80!K=Uq@xkcXjh{sR9+@M%1(vrYM&~X3j7@L z;!)X_^oQOAW~di%vAbK!?&!f{vh(7JYc9nV9c>A75>vgK4b^&Dv8ua^g7n?_wQsm~ z8b&;((?;F2=TCB>L?%mjy$lf8*cqwJ`KTeu5LkcHozQi0!gRBK*(`m2sCbVkh7T%9 zE@dQvs0$m%24(cq{)z5jr*>|}=@i*@6?Tz=x7D}cb0n5g=T2c-DL!9}xx=0X+2$_E zZxy{YUaVa{6K>s?D`M!a6)dfln~Q#XYtFVvi%Ei|{*P|i95%EYMXucZ&u-!Sms@&p z0yGGN5e9O>Oa^ElbMI8zjrJ-d1_JdoH0sXtbyuiC6KGuCfwj)ADqza3=%wzOv?$yb z|5{yLsn$C1i!S8j))PB7rzKwJ%d^;Lo+Ho6g`m1 z^mmqoe-&wJM!Mq~GFTXYW68g=?oY0K&$%{;W~DY>LX4TZ8L^<6@#PIE+2`CIgkDeg zQ98`WFp`sE(C$x9O}MnrOSWF%fej9-lX!ZW^YuhrZ<3SdNpzado*f@=q2+25sVC#; zi&Qtk>Km!1JhH}=V0ZGh=-=BSLsru<;mh`J93fhPZz7 ze!`NEr~gQZUd1OD-`X`gbxx3hHqMuz8z&gSl>rY5wumNifm zgyTwwf~j6pC-Uw;xYYfzqY?|!o%%?I;r#{Z;%Ni>j+7lIrVNxK32vOEh%0u{#5#Bj z3E%8cfK0aSZcgm`B<=n;oOzZ~E8I8M z9J?zkukiBkv5D`PfS`}0!M!+3P?Ja;SYeC5itI#tvq=k`aDuWmGoA4kDtI+EH;LFP zc#R;2XF>bS_}TR?Td$&i_EEWYjE7{C&{WN%#MSx$=PjEWdq-P+^i=QCAuYT5Vdvw! 
z;4J5GYE?Til`<1g@TZE+5CW~e+iKoW8>p61p6uf@LM{&d!8Z4t7ua{?VBStezFLAk zc^QZm&wSQ;rdF;KO^>EK6gpzrxaH&bwA_g6Sp2&i=@amygO$qp#0(h*rq7Dte4`SyFeZB10rLsa z#zEaPAylTTY9Wc+K>~s7vCWmm?$b39!gpj%10WKy$<-9HN;%2buk#0)aPsWJnYQfO z_BQ7Z9!=@STeRt7!>U18FHGh9(~7;JORf@ATZ(xSEs6^WTrhtzXki)OnsVJXtby zU69Oppi#)uIqTztk5oxqIt3`BX_YrllU9#pCe-&QLuCNytbEk&Nc#k#Ea#eW)atoXNxm;F;_7 zqZQm&k{1c$w43*!WBWurqZ*)Wzox;q6tZWa5E77ENKv0=nje<*Ru|FeyMM~=wSW5l zeXAua!6OmRy)O&1hdR#~J%Ft9R{S8@-Ocf+dX-wk01O_Nz-iT)~t^ z@8R^?K8iO^z3I8#{;u)i?g((2)I%fW@V*LDt!i|i#FJ)WnJDWU6k)rHdyFd7*w_5+ zo~%SiT$@>Ew~#Q1tX`NT?i2TET8+nyE^bgzA=&Tt0Y^~Ja4)@V4}RvC>{UY4dZx&K zpw7(#n~>-TeOogw;wU|yl-(z|o9$92-$+Z-PpZUC#y?d_yJ@Gz*&flb{;{M(>D& z&C)(dL_?Fd^!4N<7`?TwBsfwR&<&Wih^&CZ;Sxz=sHa@$XDlHPKjWQnCwFjbvU$x6 zdg3}8$!}~0x&_S-g|7EAahOfVW5#5XuRDGY+;n;)y$5bXVa09reL}=KVJRF?=dzY4 zpiFpX!605%uT~x3*uYL@Vpcy<>xE;oLY1mQ3Qb&PB1J)sr zO~05&+}f4=h$H3|+t7#O8G#o*p+?$WWYfJR!xf%l{Gg|Rhy zGo{~xDzMg)o&XM2)4bL;fc0=iDFpl4gx)>IKr~%ijjQ_%DJ^dsD6N>yk_u>);ZE2~ za^SU(#oMKax(BbJ!%KwYH<)zle5^u;_Z%KhunmzHbu6kRtG~{@$|*KF_Ed%a>j6vr zoyyzF2pz%Pq}(svI@K@Pg_fMs>mBegaD_}P1Mf+qalUFTGgil2qVDe0*tIi*hqlYk zzQVq}Sn0~tB4aAE0D3m^^7+VTc7dfdH^gG+(Fms=aG+PHCA)p@6dJc+&>Uq>C~m0A z$v#Y~+;z+ly!kvPL_v~CIclo%oj9^sa7c|!!g^^Lh|~d-?(5;mFMS!PcH}Io3BB=(Pjzq zBC83Ci z&r!%Y=K6vM!a@vDR^KjD$8b5w;J)(rsWuh5z;(*Mcq3@(nphqZm_C+g#>B?S%0U#g zCZ*3m>K;=5plVO=DW{e>f1!DaO7SSrNkPx}u*r2Y5J)0bm~0nUi4rLG;?ah)j#OaT zR2F=-XW6`c;J)?D=2Z2FDMp^!IX$j3@LGs++vl9&#iikR6A z{7fLFV?1Dp$FCr)aAFNOp%Jr+S1c+YvJj)~x%*bncuT%A@sMP6tQP3 zA3@%zI?H;1b%k1*i9j6liFweMEfL8_Bq}3B<;Ds-T;)>Md@rnmXP$5N&EQyga6QoN zlJ?lQnU!wo265F93XwgKK;QD^w4OM&!t@w7FCMO~cFNV;v1HS4@sNZ~#Usk$qk(hQ zzmslD>J*#W-b;wpkeMP_ud^5-(A;p4|Hc1Eiiy`NChk7Jmq)zQ02&=8>G$Rv;jRL7n%0 z{6f2-I4zZLlTmD}FxWHKbY{tYL{QL=#_Mcv&;i*g{O*BWGnYC}qz7b>g9bVN+7$k^ z8$z~tCQhD;cIGgT?|Y(wxO)Ipbh-poz8Jp{9LbnB#2hW10RLLZLk>a&`dkr2{AoC{ zRr$knOy<;8;K(IGWVcO-lrGHe%`}|s9h`Yg931|V>HiTdke*A4m$Dz{1qE*c4`@M? 
zR5ha-Ape)93c`D(CMa=k@1*0G_3}9>80MKz_OsSm96!nUPb$0(T~dd?v^#u^5LNhM z%GS2eNk)N2rA$^8>@nigaLmNdM65J#<|lm(&|duTLakWShCUY9Ey8e*{z0Z2J@XhP zWgs{Kc>RvbV|u$mosL2$(v8eXK|+Pz-BbCNR64kZQDq*4kA++Gc)E{&ER}#(I5fe( z@_3?Z3M%8HbT&Ck!lwiG9o64bQj=?Uu`aD!_>%6-oL78Tf8OMMb1grO)cak?WZ?3% zDt_(^9cnJoQWC9@4T4v;+~vj?8SmI-E#tuco{K7;J zckp!g3}$ne7@sqx(4Jd-UZyVGJ!GbOrz_*`FpJdtH!ed6petfr?JWWH6N^oTxR(*` z8PN0E1*)$vxJz=u=P*7gZt1Y(0=ZH7Jz|n|1f$iEOg2HTUimwd&Fr0E|4nD zv@UE5y+f&(UpA8*J-wP#vlzTrdp8J7qW2DdFGnXP1E?o};Nj6U(kj6dr!Wk3B(j-N z{Ge%MX*F-4%2(wByjl@!&qhl+rrmv%1D$`q&7}{dY2QZEok_eGJ&Qs9>E;#c*nq>% z<3TtV!~w*tou1Op#PBt5wMqK`0kq|}(ixzBaFLs}5yRy{cCg;?)YtLNxLjTK$E|a; zMjl$@vAhsX!HPM*9c_mXc)WS44+-RA=bGig^>NTQd(PZUVaYP*>5p;I4A2oy$#c^+ zOO?meB(gKXR(X$*3B1!*BCDWG(pHQO{+=U)Aov8*XBJFI4|Q`@W2GS@-;j z#D)Fug7-r`zifP2E&ORLfb(zT-?YQaD3`^xpD4<>e?0S%A>&7TCH%Ltb>ou3HBe~x1RtI z(*MOBKU=xWfR}rrpMcV2mjN$#MwhXE?RS160RV+atjpccW$RyS%-^kDDF0^tqvBjP jznmR@0vuiYUtZ)d(xD@x>)QzdvSmg?mN}%j1OWUG0vzP8 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test_types.xlsb b/pandas/tests/io/data/excel/test_types.xlsb new file mode 100644 index 0000000000000000000000000000000000000000..e7403aa288263456f3468a721206ad1ab786b586 GIT binary patch literal 8053 zcmeHMgFa2mq)+ z{rgUiZcs-zV@+>osH-8DmxDb`9uklx2LObB|KIC>*aOL$11cRnxJ~qjIEhs(9s>xp zlXWO8_{7CBcvjO({A!f*M~#eEz5?6#RqIP!BA?Ch)$%P3S*zM3q9dNG5`CTv{gvos@p zb6CNFdc_)k=atC;O>!-sy^?tctUI5$cYUtYLT_TzkTH6^X3g8PGO<7HWGPwUA)BUs z`No|>RdyZ<4W{CnHY#+wi8>w1* zaEZ>WinDxEwUx8v?eu(<2vH+HTx~C{56v?Si*3ekc;Za~Q#PFjdBB<9HOJT;owT)<_k-h@k1jkMsBDD#)i()_ekcDjH~Mad(SvF7N6=%o8|2 zZ*C9)s=qMze$Z;yIXoPd;Kam$GuPM!YVXR;_5JxTWB-R~`j@H4B#posi`l&|*K~jF z;*+BsYHs~InwN{XY&gq|j^y>R`3#chXBN!G{0)>>`^O{MO^ZVl#VtZI?3F?8YC&d2 zUAog+2V+f@FC@EUhDTZA*o9m|4E>+H_TNlYr;2JmRfFCkX5Z2{8i`Ol7gOWttQ~l! z-7do#o_inSlN0wJjg)wjYg@_YQB7YBshJEE)uFM?q3QYphl&m2v222OZkC{<%o$Pv zc4}UIIRWE*GIYu!^q8&JnQ?MU#H8d0Vq+T~c{f#Ekp!d)A$maL4Pkc!pxy^;x5YeY zPnGC##0UJa;9gaVa?$6((bUCH;8XuQPd=RnYo%Gj$gg46d0wrd(W39|= zLGI%LirgtLI9tz;R!%qa_d5~j>tXCHH55ob1ok`RpsJ~fAw*QOy^7C4vLJeV`~ID> zQ$COM3H-MnG>N)mtj>cS`N3;J*1tB#xMH)62z3p! 
zMTq)h#@mhK-Wk0%R=YYj>}A;HZpoUAQZV`4p|c zgy>QKwjfSd5>*`S=RVq_z8*;Q8b9kb`kJFW-kaLK?V zJcAZ%Y5+w^x6&FgFV8U_zY_5%0Vv7~tPAesVF14b4)t2~n)f2DxP_U8@#e_kC8${T z0;1LdQ^nvqKHfp|}Crdw6w5LwL}m}F|7yC zkM#suu zqc}2qc-+HWQF7*x1}2d*rq$^oS%!tNi(SDO_|Woh%3SD#;vMe_$O{)uwe9C&zzk*1 z9WL&agFb_%>B6GKA=!#{=_Rk#--{T7wU!@_M%4BA686Uhkz4@tx#GsrHHdsYfja5u)-|%Et=BDsT_!qG{2ab zR|oacTj$;ZLXIc*{ZB?uMj;K`z%x#42Zh;1Q`ZL#IThPgM^F0`y8;UohPufA(e1fv zOldIq2nF2r^uM|Nd%U??L!oZ2zo)DpAHaVmEc+3dEu5&x%NIHOCT_-=Cp_H|8Z)K| zcH2~vwa+^FQ(~bCM`x!FdozcrUnFv`>q4~_S)$|ZbL{j~I|W2WY6JHNrgubHSbMRj zdo4_u11FrlwnysBN*Evc;cML?LR4hkzHj#m1C-3fw9@$83|rTBuvW#oO6aM0!(PU4 zJTfMvkSr@utPy(>3OPNZ7WK-=?=0aLK&C9}o4?U)uE>4?-lrFxD-HE7G+W;EoVj~9 zfXe6UWWpKcp8^j?jN1|jAL*v~-i!QToSQY&0m}X33I5)T>^>TbCYL2@g7fE^qka6PKD+`YWC=lkBsa>IAH}NG=1k9OQB|OsSV($a=V|ML3<_^M_)Jn952bijR-Fr{9|{gNWpJ!!R9Su} zyb4Y_=qFw$mEFt{EL_(s7-#8;U}#CrA(<3U8|fqTB`6qgtydF&d5|ubi(Hr-|IF52 z-g#hHQS+_MEYyQ#-)3cW+ExXGjEG{K*0eHFRzIyWY>1p9W*v1&qsvf0lAp zt=YJ7WE8CK_a+#}cW?)6hk!f(`(CmVT<9dT!JO+V71y5R&G^XEN{0G)(wut3p37#5K?LcNd(p;hQl?V z$W3c>wFTMaOy?EK#0N-$uVv5(cnnIk9zGMf)XA2Vu!+w@jKQ?&!bpqAqV4J-&`?RK zS^WU2XhHD4_;#9+D$(ry^*BSPYN_c`N8q(Kw2v~+!nFVDaPh;<)e!O0zQp5;p41z; zqNQ#Of}79TPyEjZs`fQ-*twJr&&sHmu1-4Ut8UKb&y7XuJcUKT9|`cyKQ8ucF3}Tp z04XmI(ONS*8KgUtB=NR9imCd0flJ2ppW08W9om^GYGN?FtWGCaMB=_E;P(uk=gk1A z_#XPkh+;WIhM>A$H3~$Z`@Z&f-jm)Myz5ELm=YXb#`scaMSV8D`oS^ngGd>SOq1=B z{TbHoxG8^?2f}qSW|b3VWprspyD=KVM-F}1 z56ji?>02Pj{s)aV$F!QlR&4%~MMX>p;(=#4D_u6pC_pz7ShLPf2kN7G#byl8rpmYL ztD)Z&_&!)b@`X=3X4MPMisOboAJGEkso$yZ32Z87l}c*gBtmyhfQ%N6=%$0c$2L zX1Xs<8F=2kzVX+ob>cQY=e-b#+cD}6r&?`H?NiYfSK-#E;~t$Yo`^e1md*a!=^O92 z5i3h=I zHIAq(vjYh{Ottcplnm(^Qcwo}LX3y538U;6EF5##M4SbWS4%00BstH~FW1hN!+m;i z5OEP`g<^uw{0c9V8+}#{_Xgv)xBa{8=m2`V%1DGS9N8;$;RrypU9m|Xke9jUn+M@UP zGjjSCMc!L2y9C_py0y6k?Q8qBw(W+ehkkKI<8JT3c^p|S(qRzS&|`hh$L`fh7VOUF zLg`t)%x+!Ow5YGDA88?H;vo>DOiiZq0R>IH>DX7C%6OnQ(@jz1;N&>^#u!y?wLyy) z?e1(N+gq;}kh}F8EP0Aelge>laA*eJjkVR9XtI4tai+b9gTXpLDqmX+cfFd{L{?Wh zsQ}buAVr7$2)QgalD6pi9dYHwdDhp>eeU}H(ngGJ%WtnJw9bJbSdO-mrJ6>J=R|e? 
[elided: base85 payload of a GIT binary patch adding an .xlsb test fixture; the
corresponding diff header falls before this section]
literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/excel/testskiprows.xlsb b/pandas/tests/io/data/excel/testskiprows.xlsb
new file mode 100644
index 0000000000000000000000000000000000000000..a5ff4ed22e70c9d19b22488024e79dda8144a598
GIT binary patch
literal 7699
[elided: base85 payload]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/excel/times_1900.xlsb b/pandas/tests/io/data/excel/times_1900.xlsb
new file mode 100644
index 0000000000000000000000000000000000000000..ceb7bccb0c66eb7e46d4c7d2f0d05f5e6aced3da
GIT binary patch
literal 7773
[elided: base85 payload]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/excel/times_1904.xlsb b/pandas/tests/io/data/excel/times_1904.xlsb
new file mode 100644
index 0000000000000000000000000000000000000000..e426dc959da496e4d70b4f7e798c73e141a40dc8
GIT binary patch
literal 7734
[elided: base85 payload]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py
index a257735dc1ec5..0455e0d61ad97 100644
--- a/pandas/tests/io/excel/conftest.py
+++ b/pandas/tests/io/excel/conftest.py
@@ -35,7 +35,7 @@ def df_ref(datapath):
     return df_ref
 
 
-@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods"])
+@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods", ".xlsb"])
 def read_ext(request):
     """
     Valid extensions for reading Excel files.
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 629d3d02028bd..f8ff3567b8b64 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -31,7 +31,7 @@ def ignore_xlrd_time_clock_warning():
         yield
 
 
-read_ext_params = [".xls", ".xlsx", ".xlsm", ".ods"]
+read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
 engine_params = [
     # Add any engines to test here
     # When defusedxml is installed it triggers deprecation warnings for
@@ -57,6 +57,7 @@ def ignore_xlrd_time_clock_warning():
             pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"),
         ],
     ),
+    pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")),
     pytest.param("odf", marks=td.skip_if_no("odf")),
 ]
 
@@ -73,6 +74,10 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
         return False
     if read_ext == ".ods" and engine != "odf":
         return False
+    if engine == "pyxlsb" and read_ext != ".xlsb":
+        return False
+    if read_ext == ".xlsb" and engine != "pyxlsb":
+        return False
     return True
 
 
@@ -120,7 +125,6 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch):
         """
         Change directory and set engine for read_excel calls.
""" - func = partial(pd.read_excel, engine=engine) monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "read_excel", func) @@ -142,6 +146,8 @@ def test_usecols_int(self, read_ext, df_ref): ) def test_usecols_list(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") df_ref = df_ref.reindex(columns=["B", "C"]) df1 = pd.read_excel( @@ -156,6 +162,8 @@ def test_usecols_list(self, read_ext, df_ref): tm.assert_frame_equal(df2, df_ref, check_names=False) def test_usecols_str(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") df1 = df_ref.reindex(columns=["A", "B", "C"]) df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A:D") @@ -188,6 +196,9 @@ def test_usecols_str(self, read_ext, df_ref): "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] ) def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + expected = df_ref[["A", "C"]] result = pd.read_excel( "test1" + read_ext, "Sheet1", index_col=0, usecols=usecols @@ -203,11 +214,17 @@ def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_r tm.assert_frame_equal(result, expected, check_names=False) def test_read_excel_without_slicing(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + expected = df_ref result = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0) tm.assert_frame_equal(result, expected, check_names=False) def test_usecols_excel_range_str(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + expected = df_ref[["C", "D"]] result = pd.read_excel( "test1" + read_ext, "Sheet1", index_col=0, usecols="A,D:E" @@ -274,12 +291,16 @@ def test_excel_stop_iterator(self, read_ext): tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self, read_ext): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") parsed = pd.read_excel("test3" + read_ext, "Sheet1") expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) def test_excel_table(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") df1 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0) df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], index_col=0) @@ -291,6 +312,8 @@ def test_excel_table(self, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, read_ext): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") expected = DataFrame.from_dict( OrderedDict( @@ -488,6 +511,9 @@ def test_read_excel_blank_with_header(self, read_ext): def test_date_conversion_overflow(self, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + expected = pd.DataFrame( [ [pd.Timestamp("2016-03-12"), "Marc Johnson"], @@ -504,9 +530,14 @@ 
def test_date_conversion_overflow(self, read_ext): tm.assert_frame_equal(result, expected) def test_sheet_name(self, read_ext, df_ref): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") filename = "test1" sheet_name = "Sheet1" + if pd.read_excel.keywords["engine"] == "openpyxl": + pytest.xfail("Maybe not supported by openpyxl") + df1 = pd.read_excel( filename + read_ext, sheet_name=sheet_name, index_col=0 ) # doc @@ -531,6 +562,10 @@ def test_bad_engine_raises(self, read_ext): @tm.network def test_read_from_http_url(self, read_ext): + if read_ext == ".xlsb": + pytest.xfail("xlsb files not present in master repo yet") + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") url = ( "https://raw.githubusercontent.com/pandas-dev/pandas/master/" @@ -599,6 +634,8 @@ def test_read_from_py_localpath(self, read_ext): tm.assert_frame_equal(expected, actual) def test_reader_seconds(self, read_ext): + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") # Test reading times with and without milliseconds. GH5945. expected = DataFrame.from_dict( @@ -627,6 +664,9 @@ def test_reader_seconds(self, read_ext): def test_read_excel_multiindex(self, read_ext): # see gh-4679 + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) mi_file = "testmultiindex" + read_ext @@ -786,6 +826,9 @@ def test_read_excel_chunksize(self, read_ext): def test_read_excel_skiprows_list(self, read_ext): # GH 4903 + if pd.read_excel.keywords["engine"] == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + actual = pd.read_excel( "testskiprows" + read_ext, "skiprows_list", skiprows=[0, 2] ) @@ -851,13 +894,11 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch): """ Change directory and set engine for ExcelFile objects. """ - func = partial(pd.ExcelFile, engine=engine) monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "ExcelFile", func) def test_excel_passes_na(self, read_ext): - with pd.ExcelFile("test4" + read_ext) as excel: parsed = pd.read_excel( excel, "Sheet1", keep_default_na=False, na_values=["apple"] @@ -928,6 +969,10 @@ def test_unexpected_kwargs_raises(self, read_ext, arg): pd.read_excel(excel, **kwarg) def test_excel_table_sheet_by_index(self, read_ext, df_ref): + # For some reason pd.read_excel has no attribute 'keywords' here. + # Skipping based on read_ext instead. + if read_ext == ".xlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") with pd.ExcelFile("test1" + read_ext) as excel: df1 = pd.read_excel(excel, 0, index_col=0) @@ -951,6 +996,11 @@ def test_excel_table_sheet_by_index(self, read_ext, df_ref): tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_sheet_name(self, read_ext, df_ref): + # For some reason pd.read_excel has no attribute 'keywords' here. + # Skipping based on read_ext instead. 
+        if read_ext == ".xlsb":
+            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
+
         filename = "test1"
         sheet_name = "Sheet1"
 
diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py
index d1f900a2dc58b..cc7e2311f362a 100644
--- a/pandas/tests/io/excel/test_xlrd.py
+++ b/pandas/tests/io/excel/test_xlrd.py
@@ -10,9 +10,11 @@
 
 
 @pytest.fixture(autouse=True)
-def skip_ods_files(read_ext):
+def skip_ods_and_xlsb_files(read_ext):
     if read_ext == ".ods":
         pytest.skip("Not valid for xlrd")
+    if read_ext == ".xlsb":
+        pytest.skip("Not valid for xlrd")
 
 
 def test_read_xlrd_book(read_ext, frame):
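A minimal sketch of what the engine wiring above enables (not part of the patch;
it assumes the optional third-party pyxlsb package is installed and that a local
example.xlsb file exists, both of which are assumptions):

    import pandas as pd

    # .xlsb files are binary Excel workbooks; with this change applied,
    # read_excel can route them to the pyxlsb engine.
    df = pd.read_excel("example.xlsb", sheet_name="Sheet1", engine="pyxlsb")
    print(df.head())

As the xfails above note, pyxlsb surfaces datetime cells as raw Excel serial
numbers, so sheets containing datetimes are skipped for this engine.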
From bbcda98c7974ba5320174ba6be117d399c15603e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Mon, 20 Jan 2020 17:06:09 -0800
Subject: [PATCH 137/158] TST: Add regression tests for fixed issues (#31161)

* TST: Add tests for fixed issues

* Platform compat test

* Use range instead

* Address comments
---
 pandas/tests/frame/test_constructors.py  | 18 +++++++++++++
 pandas/tests/groupby/test_apply.py       | 19 +++++++++++++
 pandas/tests/groupby/test_groupby.py     |  7 +++++
 pandas/tests/indexing/test_loc.py        | 10 +++++++
 pandas/tests/io/test_pickle.py           | 15 +++++++++++
 pandas/tests/series/test_constructors.py |  9 +++++++
 pandas/tests/test_multilevel.py          | 34 ++++++++++++++++++++++++
 7 files changed, 112 insertions(+)

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index a861e0eb52391..b1620df91ba26 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -2433,6 +2433,24 @@ def test_datetime_date_tuple_columns_from_dict(self):
         expected = DataFrame([0, 1, 2], columns=pd.Index(pd.Series([tup])))
         tm.assert_frame_equal(result, expected)
 
+    def test_construct_with_two_categoricalindex_series(self):
+        # GH 14600
+        s1 = pd.Series(
+            [39, 6, 4], index=pd.CategoricalIndex(["female", "male", "unknown"])
+        )
+        s2 = pd.Series(
+            [2, 152, 2, 242, 150],
+            index=pd.CategoricalIndex(["f", "female", "m", "male", "unknown"]),
+        )
+        result = pd.DataFrame([s1, s2])
+        expected = pd.DataFrame(
+            np.array(
+                [[np.nan, 39.0, np.nan, 6.0, 4.0], [2.0, 152.0, 2.0, 242.0, 150.0]]
+            ),
+            columns=["f", "female", "m", "male", "unknown"],
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameConstructorWithDatetimeTZ:
     def test_from_dict(self):
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 708d3429285a8..fc7b9f56002d8 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -792,3 +792,22 @@ def test_apply_multi_level_name(category):
     )
     tm.assert_frame_equal(result, expected)
     assert df.index.names == ["A", "B"]
+
+
+def test_groupby_apply_datetime_result_dtypes():
+    # GH 14849
+    data = pd.DataFrame.from_records(
+        [
+            (pd.Timestamp(2016, 1, 1), "red", "dark", 1, "8"),
+            (pd.Timestamp(2015, 1, 1), "green", "stormy", 2, "9"),
+            (pd.Timestamp(2014, 1, 1), "blue", "bright", 3, "10"),
+            (pd.Timestamp(2013, 1, 1), "blue", "calm", 4, "potato"),
+        ],
+        columns=["observation", "color", "mood", "intensity", "score"],
+    )
+    result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
+    expected = Series(
+        [np.dtype("datetime64[ns]"), np.object, np.object, np.int64, np.object],
+        index=["observation", "color", "mood", "intensity", "score"],
+    )
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 7e374811d1960..eb9552fbbebc1 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1952,6 +1952,13 @@ def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
     tm.assert_frame_equal(result, expected)
 
 
+def test_ffill_missing_arguments():
+    # GH 14955
+    df = pd.DataFrame({"a": [1, 2], "b": [1, 1]})
+    with pytest.raises(ValueError, match="Must specify a fill"):
+        df.groupby("b").fillna()
+
+
 def test_groupby_only_none_group():
     # see GH21624
     # this was crashing with "ValueError: Length of passed values is 1, index implies 0"
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 78fcd15ab4cc1..4c1436b800fc3 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1002,3 +1002,13 @@ def test_loc_axis_1_slice():
         ),
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_loc_set_dataframe_multiindex():
+    # GH 14592
+    expected = pd.DataFrame(
+        "a", index=range(2), columns=pd.MultiIndex.from_product([range(2), range(2)])
+    )
+    result = expected.copy()
+    result.loc[0, [(0, 1)]] = result.loc[0, [(0, 1)]]
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index 3d427dde573af..22c4e38206df6 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -11,6 +11,7 @@
 3. Move the created pickle to "data/legacy_pickle/" directory.
 """
 import bz2
+import datetime
 import glob
 import gzip
 import os
@@ -487,3 +488,17 @@ def open(self, *args):
         df.to_pickle(mockurl)
         result = pd.read_pickle(mockurl)
         tm.assert_frame_equal(df, result)
+
+
+class MyTz(datetime.tzinfo):
+    def __init__(self):
+        pass
+
+
+def test_read_pickle_with_subclass():
+    # GH 12163
+    expected = pd.Series(dtype=object), MyTz()
+    result = tm.round_trip_pickle(expected)
+
+    tm.assert_series_equal(result[0], expected[0])
+    assert isinstance(result[1], MyTz)
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index d760939657d47..2651c3d73c9ab 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -1115,6 +1115,15 @@ def create_data(constructor):
         tm.assert_series_equal(result_datetime, expected)
         tm.assert_series_equal(result_Timestamp, expected)
 
+    def test_contructor_dict_tuple_indexer(self):
+        # GH 12948
+        data = {(1, 1, None): -1.0}
+        result = Series(data)
+        expected = Series(
+            -1.0, index=MultiIndex(levels=[[1], [1], [np.nan]], codes=[[0], [0], [-1]])
+        )
+        tm.assert_series_equal(result, expected)
+
     def test_constructor_mapping(self, non_mapping_dict_subclass):
         # GH 29788
         ndm = non_mapping_dict_subclass({3: "three"})
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 5382ad84bcca2..1adc5011a0c31 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -2147,6 +2147,40 @@ def test_sort_index_level_mixed(self):
             sorted_after.drop([("foo", "three")], axis=1),
         )
 
+    def test_sort_index_categorical_multiindex(self):
+        # GH 15058
+        df = DataFrame(
+            {
+                "a": range(6),
+                "l1": pd.Categorical(
+                    ["a", "a", "b", "b", "c", "c"],
+                    categories=["c", "a", "b"],
+                    ordered=True,
+                ),
+                "l2": [0, 1, 0, 1, 0, 1],
+            }
+        )
+        result = df.set_index(["l1", "l2"]).sort_index()
+        expected = DataFrame(
+            [4, 5, 0, 1, 2, 3],
+            columns=["a"],
+            index=MultiIndex(
+                levels=[
+                    pd.CategoricalIndex(
+                        ["c", "a", "b"],
+                        categories=["c", "a", "b"],
+                        ordered=True,
+                        name="l1",
+                        dtype="category",
+                    ),
+                    [0, 1],
+                ],
+                codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
+                names=["l1", "l2"],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
     def test_is_lexsorted(self):
         levels = [[0, 1], [0, 1, 2]]

From bfe10f08696d0073db27bac898d6629fbef09f5d Mon Sep 17 00:00:00 2001
From: fujiaxiang
Date: Tue, 21 Jan 2020 10:46:12 +0800
Subject: [PATCH 138/158] TST: added more tests (GH26996)

---
 doc/source/whatsnew/v1.0.0.rst      |  1 +
 doc/source/whatsnew/v1.1.0.rst      |  3 ++-
 pandas/tests/window/test_rolling.py | 36 ++++++++++++++++++++++++++---
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 3bd86bb02155f..00bfe5af828dc 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -1131,6 +1131,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`)
 - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`)
 - Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`)
+- Bug in :meth:`Rolling.count` and :meth:`Expanding.count` argument ``min_periods`` ignored (:issue:`26996`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index a979b12c18f16..2711db0e60eaa 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -133,7 +133,8 @@ Plotting
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
-- Bug in :meth:`Rolling.count` and :meth:`Expanding.count` argument ``min_periods`` ignored (:issue:`26996`)
+-
+-
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index 719809dc8cfd6..b1ba19e4b7688 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -448,8 +448,36 @@ def test_min_periods1():
     tm.assert_series_equal(result, expected)
 
 
-def test_rolling_count_with_min_periods():
+@pytest.mark.parametrize("test_series", [True, False])
+def test_rolling_count_with_min_periods(test_series):
     # GH 26996
-    result = Series(range(5)).rolling(3, min_periods=3).count()
-    expected = Series([np.nan, np.nan, 3.0, 3.0, 3.0])
-    tm.assert_series_equal(result, expected)
+    if test_series:
+        result = Series(range(5)).rolling(3, min_periods=3).count()
+        expected = Series([np.nan, np.nan, 3.0, 3.0, 3.0])
+        tm.assert_series_equal(result, expected)
+    else:
+        result = DataFrame(range(5)).rolling(3, min_periods=3).count()
+        expected = DataFrame([np.nan, np.nan, 3.0, 3.0, 3.0])
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+def test_rolling_count_default_min_periods_with_null_values(test_series):
+    # GH 26996
+    # We need rolling count to have default min_periods=0,
+    # as the method is meant to count how many non-null values,
+    # we want to by default produce a valid count even if
+    # there are very few valid entries in the window
+    values = [1, 2, 3, np.nan, 4, 5, 6]
+    expected_counts = [1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0]
+
+    if test_series:
+        ser = Series(values)
+        result = ser.rolling(3).count()
+        expected = Series(expected_counts)
+        tm.assert_series_equal(result, expected)
+    else:
+        df = DataFrame(values)
+        result = df.rolling(3).count()
+        expected = DataFrame(expected_counts)
+        tm.assert_frame_equal(result, expected)
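A minimal sketch of the behavior these tests pin down (not part of the patch; it
assumes a pandas build that contains the GH 26996 fix, and the values mirror the
tests above):

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, 3, np.nan, 4, 5, 6])

    # count() keeps its default of min_periods=0: every window reports how
    # many non-null values it holds, even the short windows at the start.
    s.rolling(3).count()
    # -> 1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0

    # An explicit min_periods is honored: windows with fewer observations
    # than min_periods yield NaN instead of a count.
    pd.Series(range(5)).rolling(3, min_periods=3).count()
    # -> NaN, NaN, 3.0, 3.0, 3.0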
From 964400d20fdebe97868e511fc128be9188f56064 Mon Sep 17 00:00:00 2001
From: rebecca-palmer
Date: Tue, 21 Jan 2020 10:18:38 +0000
Subject: [PATCH 139/158] Remove possibly illegal test data (#31146)

---
 .../io/data/html/computer_sales_page.html |  619 ---
 pandas/tests/io/data/html/macau.html      | 3691 -----------------
 pandas/tests/io/data/html/nyse_wsj.html   | 1207 ------
 pandas/tests/io/test_html.py              |   68 +-
 4 files changed, 12 insertions(+), 5573 deletions(-)
 delete mode 100644 pandas/tests/io/data/html/computer_sales_page.html
 delete mode 100644 pandas/tests/io/data/html/macau.html
 delete mode 100644 pandas/tests/io/data/html/nyse_wsj.html

diff --git a/pandas/tests/io/data/html/computer_sales_page.html b/pandas/tests/io/data/html/computer_sales_page.html
deleted file mode 100644
index ff2b031b58d64..0000000000000
--- a/pandas/tests/io/data/html/computer_sales_page.html
+++ /dev/null
@@ -1,619 +0,0 @@
[deleted HTML fixture elided -- 619 lines rendering an HP quarterly and
six-month net-revenue-by-segment table; the markup was lost in extraction and
only stray table text and dashes survived]
    diff --git a/pandas/tests/io/data/html/macau.html b/pandas/tests/io/data/html/macau.html deleted file mode 100644 index edc4ea96f0f20..0000000000000 --- a/pandas/tests/io/data/html/macau.html +++ /dev/null @@ -1,3691 +0,0 @@ - - - - - - - - - - - - - - - -Traffic Statistics - Passengers - - - - -
    -
    - - -
    - -
    - - - - - - - - - - - - - - -
    -
    - - -
    - -
    -
    -

    Traffic Statistics - Passengers

    - -
    -
    -
    - - -
    - -
    -
    -
    -
    - - - Traffic Statistics - - - - - -


    - Passengers Figure(2008-2013)

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      201320122011201020092008
    January - - 374,917 - - - 362,379 - - - 301,503 - - - 358,902 - - - 342,323 - - - 420,574 -
    February - - 393,152 - - - 312,405 - - - 301,259 - - - 351,654 - - - 297,755 - - - 442,809 -
    March - - 408,755 - - - 334,000 - - - 318,908 - - - 360,365 - - - 387,879 - - - 468,540 -
    April - - 408,860 - - - 358,198 - - - 339,060 - - - 352,976 - - - 400,553 - - - 492,930 -
    May - - 374,397 - - - 329,218 - - - 321,060 - - - 330,407 - - - 335,967 - - - 465,045 -
    June - - 401,995 - - - 356,679 - - - 343,006 - - - 326,724 - - - 296,748 - - - 426,764 -
    July - - - - - 423,081 - - - 378,993 - - - 356,580 - - - 351,110 - - - 439,425 -
    August - - - - - 453,391 - - - 395,883 - - - 364,011 - - - 404,076 - - - 425,814 -
    September - - - - - 384,887 - - - 325,124 - - - 308,940 - - - 317,226 - - - 379,898 -
    October - - - - - 383,889 - - - 333,102 - - - 317,040 - - - 355,935 - - - 415,339 -
    November - - - - - 379,065 - - - 327,803 - - - 303,186 - - - 372,104 - - - 366,411 -
    December - - - - - 413,873 - - - 359,313 - - - 348,051 - - - 388,573 - - - 354,253 -
    Total - - 2,362,076 - - - 4,491,065 - - - 4,045,014 - - - 4,078,836 - - - 4,250,249 - - - 5,097,802 -
    - -


    - Passengers Figure(2002-2007)

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      200720062005200420032002
    January - - 381,887 - - - 323,282 - - - 289,701 - - - 288,507 - - - 290,140 - - - 268,783 -
    February - - 426,014 - - - 360,820 - - - 348,723 - - - 207,710 - - - 323,264 - - - 323,654 -
    March - - 443,805 - - - 389,125 - - - 321,953 - - - 273,910 - - - 295,052 - - - 360,668 -
    April - - 500,917 - - - 431,550 - - - 367,976 - - - 324,931 - - - 144,082 - - - 380,648 -
    May - - 468,637 - - - 399,743 - - - 359,298 - - - 250,601 - - - 47,333 - - - 359,547 -
    June - - 463,676 - - - 393,713 - - - 360,147 - - - 296,000 - - - 94,294 - - - 326,508 -
    July - - 490,404 - - - 465,497 - - - 413,131 - - - 365,454 - - - 272,784 - - - 388,061 -
    August - - 490,830 - - - 478,474 - - - 409,281 - - - 372,802 - - - 333,840 - - - 384,719 -
    September - - 446,594 - - - 412,444 - - - 354,751 - - - 321,456 - - - 295,447 - - - 334,029 -
    October - - 465,757 - - - 461,215 - - - 390,435 - - - 358,362 - - - 291,193 - - - 372,706 -
    November - - 455,132 - - - 425,116 - - - 323,347 - - - 327,593 - - - 268,282 - - - 350,324 -
    December - - 465,225 - - - 435,114 - - - 308,999 - - - 326,933 - - - 249,855 - - - 322,056 -
    Total - - 5,498,878 - - - 4,976,093 - - - 4,247,742 - - - 3,714,259 - - - 2,905,566 - - - 4,171,703 -
    - -


    Passengers Figure (1996-2001)

    Month     |      2001 |      2000 |      1999 |      1998 |      1997 | 1996
    January   |   265,603 |   184,381 |   161,264 |   161,432 |   117,984 |  n/a
    February  |   249,259 |   264,066 |   209,569 |   168,777 |   150,772 |  n/a
    March     |   312,319 |   226,483 |   186,965 |   172,060 |   149,795 |  n/a
    April     |   351,793 |   296,541 |   237,449 |   180,241 |   179,049 |  n/a
    May       |   338,692 |   288,949 |   230,691 |   172,391 |   189,925 |  n/a
    June      |   332,630 |   271,181 |   231,328 |   157,519 |   175,402 |  n/a
    July      |   344,658 |   304,276 |   243,534 |   205,595 |   173,103 |  n/a
    August    |   360,899 |   300,418 |   257,616 |   241,140 |   178,118 |  n/a
    September |   291,817 |   280,803 |   210,885 |   183,954 |   163,385 |  n/a
    October   |   327,232 |   298,873 |   231,251 |   205,726 |   176,879 |  n/a
    November  |   315,538 |   265,528 |   228,637 |   181,677 |   146,804 |  n/a
    December  |   314,866 |   257,929 |   210,922 |   183,975 |   151,362 |  n/a
    Total     | 3,805,306 | 3,239,428 | 2,640,111 | 2,214,487 | 1,952,578 |    0

    (No monthly figures are recorded for 1996 in this source; its Total is shown as 0.)


    Passengers Figure (1995-1995)

    Month     |   1995
    January   |    n/a
    February  |    n/a
    March     |    n/a
    April     |    n/a
    May       |    n/a
    June      |    n/a
    July      |    n/a
    August    |    n/a
    September |    n/a
    October   |    n/a
    November  |  6,601
    December  | 37,041
    Total     | 43,642
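    As a quick sanity check on the recovered numbers (a minimal sketch; the
    values are copied from the 1995 table above)::

        # Only November and December 1995 carry data, so the Total row
        # should simply be their sum.
        november, december = 6_601, 37_041
        assert november + december == 43_642  # matches the recovered Total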


    [passenger statistic picture: chart image from the original page; nothing
    recoverable beyond the caption]


    Movement Statistics (2008-2013)

    Month     |   2013 |   2012 |   2011 |   2010 |   2009 |   2008
    January   |  3,925 |  3,463 |  3,289 |  3,184 |  3,488 |  4,568
    February  |  3,632 |  2,983 |  2,902 |  3,053 |  3,347 |  4,527
    March     |  3,909 |  3,166 |  3,217 |  3,175 |  3,636 |  4,594
    April     |  3,903 |  3,258 |  3,146 |  3,023 |  3,709 |  4,574
    May       |  4,075 |  3,234 |  3,266 |  3,033 |  3,603 |  4,511
    June      |  4,038 |  3,272 |  3,316 |  2,909 |  3,057 |  4,081
    July      |    n/a |  3,661 |  3,359 |  3,062 |  3,354 |  4,215
    August    |    n/a |  3,942 |  3,417 |  3,077 |  3,395 |  4,139
    September |    n/a |  3,703 |  3,169 |  3,095 |  3,100 |  3,752
    October   |    n/a |  3,727 |  3,469 |  3,179 |  3,375 |  3,874
    November  |    n/a |  3,722 |  3,145 |  3,159 |  3,213 |  3,567
    December  |    n/a |  3,866 |  3,251 |  3,199 |  3,324 |  3,362
    Total     | 23,482 | 41,997 | 38,946 | 37,148 | 40,601 | 49,764

    (The 2013 column runs through June only in this source; its total, 23,482,
    is the January-June sum.)


    Movement Statistics (2002-2007)

    Month     |   2007 |   2006 |   2005 |   2004 |   2003 |   2002
    January   |  4,384 |  3,933 |  3,528 |  3,051 |  3,257 |  2,711
    February  |  4,131 |  3,667 |  3,331 |  2,372 |  3,003 |  2,747
    March     |  4,349 |  4,345 |  3,549 |  3,049 |  3,109 |  2,985
    April     |  4,460 |  4,490 |  3,832 |  3,359 |  2,033 |  2,928
    May       |  4,629 |  4,245 |  3,663 |  3,251 |  1,229 |  3,109
    June      |  4,365 |  4,124 |  3,752 |  3,414 |  1,217 |  3,049
    July      |  4,612 |  4,386 |  3,876 |  3,664 |  2,423 |  3,078
    August    |  4,446 |  4,373 |  3,987 |  3,631 |  3,040 |  3,166
    September |  4,414 |  4,311 |  3,782 |  3,514 |  2,809 |  3,239
    October   |  4,445 |  4,455 |  3,898 |  3,744 |  3,052 |  3,562
    November  |  4,563 |  4,285 |  3,951 |  3,694 |  3,125 |  3,546
    December  |  4,588 |  4,435 |  3,855 |  3,763 |  2,996 |  3,444
    Total     | 53,386 | 51,049 | 45,004 | 40,506 | 31,293 | 37,564


    Movement Statistics (1996-2001)

    Month     |   2001 |   2000 |   1999 |   1998 |   1997 |   1996
    January   |  2,694 |  2,201 |  1,835 |  2,177 |  1,353 |    744
    February  |  2,364 |  2,357 |  1,826 |  1,740 |  1,339 |    692
    March     |  2,543 |  2,206 |  1,895 |  1,911 |  1,533 |    872
    April     |  2,531 |  2,311 |  2,076 |  1,886 |  1,587 |  1,026
    May       |  2,579 |  2,383 |  1,914 |  2,102 |  1,720 |  1,115
    June      |  2,681 |  2,370 |  1,890 |  2,038 |  1,716 |  1,037
    July      |  2,903 |  2,609 |  1,916 |  2,078 |  1,693 |  1,209
    August    |  3,037 |  2,487 |  1,968 |  2,061 |  1,676 |  1,241
    September |  2,767 |  2,329 |  1,955 |  1,970 |  1,681 |  1,263
    October   |  2,922 |  2,417 |  2,267 |  1,969 |  1,809 |  1,368
    November  |  2,670 |  2,273 |  2,132 |  2,102 |  1,786 |  1,433
    December  |  2,815 |  2,749 |  2,187 |  1,981 |  1,944 |  1,386
    Total     | 32,506 | 28,692 | 23,861 | 24,015 | 19,837 | 13,386


    Movement Statistics (1995-1995)

    Month     | 1995
    January   |  n/a
    February  |  n/a
    March     |  n/a
    April     |  n/a
    May       |  n/a
    June      |  n/a
    July      |  n/a
    August    |  n/a
    September |  n/a
    October   |  n/a
    November  |  126
    December  |  536
    Total     |  662
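    The tables above are the text content of the macau.html fixture that this
    patch series deletes. For reference, the removed test_thousands_macau_stats
    test (visible in the test_html.py diff further below) parsed the fixture
    roughly like this -- a sketch only, assuming a local copy of the deleted
    file::

        import pandas as pd

        # Each <table class="style1"> in the page becomes one DataFrame;
        # index_col=0 turns the month names into the index, and read_html
        # strips the "," thousands separators by default.
        dfs = pd.read_html("macau.html", index_col=0, attrs={"class": "style1"})
        df = dfs[-2]  # the removed test checked the second-to-last table
        assert not df.isna().any().any()  # no missing values in that table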


    [passenger statistic picture: chart image and page footer from the original
    page; nothing recoverable beyond the caption]
\ No newline at end of file
diff --git a/pandas/tests/io/data/html/nyse_wsj.html b/pandas/tests/io/data/html/nyse_wsj.html
deleted file mode 100644
index 2360bd49e9950..0000000000000
--- a/pandas/tests/io/data/html/nyse_wsj.html
+++ /dev/null
@@ -1,1207 +0,0 @@
    Rank | Issue(Roll over for charts and headlines) | Volume | Price | Chg | % Chg
      1 | J.C. Penney (JCP) | 250,697,455 | $9.05 | -1.37 | -13.15
      2 | Bank of America (BAC) | 77,162,103 | 13.90 | -0.18 | -1.28
      3 | Rite Aid (RAD) | 52,140,382 | 4.70 | -0.08 | -1.67
      4 | Ford Motor (F) | 33,745,287 | 17.05 | -0.22 | -1.27
      5 | Pfizer (PFE) | 27,801,853 | 28.88 | 0.36 | 1.26
      6 | Hertz Global Hldgs (HTZ) | 25,821,264 | 22.32 | 0.69 | 3.19
      7 | General Electric (GE) | 25,142,064 | 24.05 | -0.20 | -0.82
      8 | Elan ADS (ELN) | 24,725,209 | 15.59 | 0.08 | 0.52
      9 | JPMorgan Chase (JPM) | 22,402,756 | 52.24 | 0.35 | 0.67
     10 | Regions Financial (RF) | 20,790,532 | 9.30 | 0.12 | 1.31
     11 | Violin Memory (VMEM) | 20,669,846 | 7.02 | -1.98 | -22.00
     12 | Citigroup (C) | 19,979,932 | 48.89 | -0.04 | -0.08
     13 | Nokia ADS (NOK) | 19,585,075 | 6.66 | 0.02 | 0.30
     14 | Wells Fargo (WFC) | 19,478,590 | 41.59 | -0.02 | -0.05
     15 | Vale ADS (VALE) | 18,781,987 | 15.60 | -0.52 | -3.23
     16 | Delta Air Lines (DAL) | 16,013,956 | 23.57 | -0.44 | -1.83
     17 | EMC (EMC) | 15,771,252 | 26.07 | -0.11 | -0.42
     18 | Nike Cl B (NKE) | 15,514,717 | 73.64 | 3.30 | 4.69
     19 | Alcoa (AA) | 14,061,073 | 8.20 | -0.07 | -0.85
     20 | General Motors (GM) | 13,984,004 | 36.37 | -0.58 | -1.57
     21 | Oracle (ORCL) | 13,856,671 | 33.78 | -0.03 | -0.09
     22 | AT&T (T) | 13,736,948 | 33.98 | -0.25 | -0.73
     23 | Trina Solar ADS (TSL) | 13,284,202 | 14.83 | 1.99 | 15.50
     24 | Yingli Green Energy Holding ADS (YGE) | 12,978,378 | 6.73 | 0.63 | 10.33
     25 | Petroleo Brasileiro ADS (PBR) | 12,833,660 | 15.40 | -0.21 | -1.35
     26 | United Continental Holdings (UAL) | 12,603,225 | 30.91 | -3.16 | -9.28
     27 | Coca-Cola (KO) | 12,343,452 | 38.40 | -0.34 | -0.88
     28 | Arch Coal (ACI) | 12,261,138 | 4.25 | -0.28 | -6.18
     29 | Morgan Stanley (MS) | 11,956,345 | 27.08 | -0.07 | -0.26
     30 | Pandora Media (P) | 11,829,963 | 25.52 | 0.13 | 0.51
     31 | Barrick Gold (ABX) | 11,775,585 | 18.53 | 0.00 | 0.00
     32 | Abbott Laboratories (ABT) | 11,755,718 | 33.14 | -0.52 | -1.54
     33 | Banco Santander Brasil ADS (BSBR) | 11,587,310 | 7.01 | 0.46 | 7.02
     34 | Advanced Micro Devices (AMD) | 11,337,609 | 3.86 | -0.03 | -0.77
     35 | Annaly Capital Management (NLY) | 11,004,440 | 11.63 | -0.07 | -0.60
     36 | Alpha Natural Resources (ANR) | 10,941,074 | 6.08 | -0.19 | -3.03
     37 | Exxon Mobil (XOM) | 10,668,115 | 86.90 | -0.17 | -0.20
     38 | Itau Unibanco Holding ADS (ITUB) | 10,638,803 | 14.30 | 0.23 | 1.63
     39 | Merck&Co (MRK) | 10,388,152 | 47.79 | 0.11 | 0.23
     40 | Alcatel-Lucent ADS (ALU) | 10,181,833 | 3.65 | 0.01 | 0.27
     41 | Verizon Communications (VZ) | 10,139,321 | 47.00 | -0.67 | -1.41
     42 | Magnum Hunter Resources (MHR) | 10,004,303 | 6.33 | 0.46 | 7.84
     43 | Hewlett-Packard (HPQ) | 9,948,935 | 21.17 | -0.13 | -0.61
     44 | PulteGroup (PHM) | 9,899,141 | 16.57 | -0.41 | -2.41
     45 | ReneSola ADS (SOL) | 9,667,438 | 4.84 | 0.39 | 8.76
     46 | Corning (GLW) | 9,547,265 | 14.73 | -0.21 | -1.41
     47 | Cole Real Estate Investments (COLE) | 9,544,021 | 12.21 | 0.01 | 0.08
     48 | Dow Chemical (DOW) | 9,150,479 | 39.02 | -0.97 | -2.43
     49 | International Game Technology (IGT) | 9,129,123 | 19.23 | -1.44 | -6.97
     50 | Accenture Cl A (ACN) | 8,773,260 | 74.09 | -1.78 | -2.35
     51 | KeyCorp (KEY) | 8,599,333 | 11.36 | 0.02 | 0.18
     52 | Bristol-Myers Squibb (BMY) | 8,440,709 | 46.20 | -0.73 | -1.56
     53 | Companhia Siderurgica Nacional ADS (SID) | 8,437,636 | 4.36 | -0.05 | -1.13
     54 | H&R Block (HRB) | 8,240,984 | 26.36 | 0.31 | 1.19
     55 | MGIC Investment (MTG) | 8,135,037 | 7.26 | -0.10 | -1.36
     56 | RingCentral Cl A (RNG) | 8,117,469 | 18.20 | 5.20 | 40.00
     57 | United States Steel (X) | 8,107,899 | 20.44 | -0.66 | -3.13
     58 | Cliffs Natural Resources (CLF) | 8,041,572 | 21.00 | -0.83 | -3.80
     59 | Newmont Mining (NEM) | 8,014,250 | 27.98 | -0.19 | -0.67
     60 | Altria Group (MO) | 7,786,048 | 34.71 | -0.29 | -0.83
     61 | SandRidge Energy (SD) | 7,782,745 | 5.93 | -0.06 | -1.00
     62 | Molycorp (MCP) | 7,735,831 | 6.73 | -0.45 | -6.27
     63 | Halliburton (HAL) | 7,728,735 | 48.39 | -0.32 | -0.66
     64 | Taiwan Semiconductor Manufacturing ADS (TSM) | 7,661,397 | 17.07 | -0.25 | -1.44
     65 | Freeport-McMoRan Copper&Gold (FCX) | 7,622,803 | 33.42 | -0.45 | -1.33
     66 | Kodiak Oil&Gas (KOG) | 7,543,806 | 11.94 | 0.16 | 1.36
     67 | Xerox (XRX) | 7,440,689 | 10.37 | -0.01 | -0.10
     68 | Sprint (S) | 7,291,351 | 6.16 | -0.14 | -2.22
     69 | Two Harbors Investment (TWO) | 7,153,803 | 9.79 | 0.05 | 0.51
     70 | Walter Energy (WLT) | 7,152,192 | 14.19 | -0.36 | -2.47
     71 | International Paper (IP) | 7,123,722 | 45.44 | -1.85 | -3.91
     72 | PPL (PPL) | 7,026,292 | 30.34 | -0.13 | -0.43
     73 | Goldcorp (GG) | 6,857,447 | 25.76 | 0.08 | 0.31
     74 | Time Warner (TWX) | 6,807,237 | 66.20 | 1.33 | 2.05
     75 | Synovus Financial (SNV) | 6,764,805 | 3.29 | 0.02 | 0.61
     76 | AK Steel Holding (AKS) | 6,662,599 | 3.83 | -0.11 | -2.79
     77 | Boston Scientific (BSX) | 6,629,084 | 11.52 | -0.15 | -1.29
     78 | Eldorado Gold (EGO) | 6,596,902 | 6.65 | -0.03 | -0.45
     79 | Newpark Resources (NR) | 6,552,453 | 12.56 | 0.09 | 0.72
     80 | AbbVie (ABBV) | 6,525,524 | 44.33 | -0.67 | -1.49
     81 | MBIA (MBI) | 6,416,587 | 10.38 | -0.43 | -3.98
     82 | SAIC (SAI) | 6,404,587 | 16.03 | 0.13 | 0.82
     83 | Procter&Gamble (PG) | 6,389,143 | 77.21 | -0.84 | -1.08
     84 | IAMGOLD (IAG) | 6,293,001 | 4.77 | -0.06 | -1.24
     85 | Safeway (SWY) | 6,268,184 | 32.25 | -0.29 | -0.89
     86 | Kinross Gold (KGC) | 6,112,658 | 4.99 | -0.03 | -0.60
     87 | MGM Resorts International (MGM) | 5,986,143 | 20.22 | -0.05 | -0.25
     88 | Cemex ADS (CX) | 5,907,040 | 11.27 | -0.06 | -0.53
     89 | American International Group (AIG) | 5,900,133 | 49.15 | -0.30 | -0.61
     90 | Chesapeake Energy (CHK) | 5,848,016 | 26.21 | -0.20 | -0.76
     91 | RadioShack (RSH) | 5,837,833 | 3.44 | -0.43 | -11.11
     92 | U.S. Bancorp (USB) | 5,814,373 | 36.50 | -0.04 | -0.11
     93 | Eli Lilly (LLY) | 5,776,991 | 50.50 | -0.54 | -1.06
     94 | MetLife (MET) | 5,774,996 | 47.21 | -0.37 | -0.78
     95 | Yamana Gold (AUY) | 5,742,426 | 10.37 | 0.03 | 0.29
     96 | CBS Cl B (CBS) | 5,718,858 | 55.50 | -0.06 | -0.11
     97 | CSX (CSX) | 5,710,066 | 25.85 | -0.13 | -0.50
     98 | Carnival (CCL) | 5,661,325 | 32.88 | -0.05 | -0.15
     99 | Mosaic (MOS) | 5,595,592 | 43.43 | -0.76 | -1.72
    100 | Walgreen (WAG) | 5,568,310 | 54.51 | -0.22 | -0.40
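    The diff below also removes test_nyse_wsj_commas_table, which exercised this
    fixture. A sketch of what that test did, assuming a local copy of the
    deleted nyse_wsj.html::

        import pandas as pd

        # index_col=0 consumes the rank column; the remaining headers match
        # the expected Index in the removed test, and the comma-grouped
        # volumes parse as numbers.
        df = pd.read_html(
            "nyse_wsj.html", index_col=0, header=0, attrs={"class": "mdcTable"}
        )[0]
        assert df.shape[0] == 100  # one row per listed issue
        assert list(df.columns) == [
            "Issue(Roll over for charts and headlines)",
            "Volume",
            "Price",
            "Chg",
            "% Chg",
        ]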
    diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 7a814ce82fd73..b649e394c780b 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -14,7 +14,7 @@ from pandas.errors import ParserError import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, read_csv import pandas._testing as tm from pandas.io.common import file_path_to_url @@ -373,32 +373,6 @@ def test_python_docs_table(self): zz = [df.iloc[0, 0][0:4] for df in dfs] assert sorted(zz) == sorted(["Repo", "What"]) - @pytest.mark.slow - def test_thousands_macau_stats(self, datapath): - all_non_nan_table_index = -2 - macau_data = datapath("io", "data", "html", "macau.html") - dfs = self.read_html(macau_data, index_col=0, attrs={"class": "style1"}) - df = dfs[all_non_nan_table_index] - - assert not any(s.isna().any() for _, s in df.items()) - - @pytest.mark.slow - def test_thousands_macau_index_col(self, datapath, request): - # https://github.com/pandas-dev/pandas/issues/29622 - # This tests fails for bs4 >= 4.8.0 - so handle xfail accordingly - if self.read_html.keywords.get("flavor") == "bs4" and td.safe_import( - "bs4", "4.8.0" - ): - reason = "fails for bs4 version >= 4.8.0" - request.node.add_marker(pytest.mark.xfail(reason=reason)) - - all_non_nan_table_index = -2 - macau_data = datapath("io", "data", "html", "macau.html") - dfs = self.read_html(macau_data, index_col=0, header=0) - df = dfs[all_non_nan_table_index] - - assert not any(s.isna().any() for _, s in df.items()) - def test_empty_tables(self): """ Make sure that read_html ignores empty tables. @@ -571,23 +545,6 @@ def test_parse_header_of_non_string_column(self): tm.assert_frame_equal(result, expected) - def test_nyse_wsj_commas_table(self, datapath): - data = datapath("io", "data", "html", "nyse_wsj.html") - df = self.read_html(data, index_col=0, header=0, attrs={"class": "mdcTable"})[0] - - expected = Index( - [ - "Issue(Roll over for charts and headlines)", - "Volume", - "Price", - "Chg", - "% Chg", - ] - ) - nrows = 100 - assert df.shape[0] == nrows - tm.assert_index_equal(df.columns, expected) - @pytest.mark.slow def test_banklist_header(self, datapath): from pandas.io.html import _remove_whitespace @@ -894,24 +851,23 @@ def test_parse_dates_combine(self): newdf = DataFrame({"datetime": raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_computer_sales_page(self, datapath): - data = datapath("io", "data", "html", "computer_sales_page.html") - msg = ( - r"Passed header=\[0,1\] are too many " - r"rows for this multi_index of columns" - ) - with pytest.raises(ParserError, match=msg): - self.read_html(data, header=[0, 1]) - - data = datapath("io", "data", "html", "computer_sales_page.html") - assert self.read_html(data, header=[1, 2]) - def test_wikipedia_states_table(self, datapath): data = datapath("io", "data", "html", "wikipedia_states.html") assert os.path.isfile(data), f"{repr(data)} is not a file" assert os.path.getsize(data), f"{repr(data)} is an empty file" result = self.read_html(data, "Arizona", header=1)[0] + assert result.shape == (60, 12) + assert "Unnamed" in result.columns[-1] assert result["sq mi"].dtype == np.dtype("float64") + assert np.allclose(result.loc[0, "sq mi"], 665384.04) + + def test_wikipedia_states_multiindex(self, datapath): + data = datapath("io", "data", "html", "wikipedia_states.html") + result = 
self.read_html(data, "Arizona", index_col=0)[0] + assert result.shape == (60, 11) + assert "Unnamed" in result.columns[-1][1] + assert result.columns.nlevels == 2 + assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04) def test_parser_error_on_empty_header_row(self): msg = ( From 7ffcf9d6753e7de2c5318e8e0ecdc63586d502f3 Mon Sep 17 00:00:00 2001 From: Jiaxiang Date: Tue, 21 Jan 2020 18:50:46 +0800 Subject: [PATCH 140/158] BUG: concat not copying index and columns when copy=True (#31119) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/indexes/api.py | 18 +++++++++++++++--- pandas/core/reshape/concat.py | 6 +++++- pandas/tests/reshape/test_concat.py | 14 ++++++++++++++ 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a04ba157ce0ae..8cbc95f0349cf 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -156,7 +156,7 @@ Reshaping - Bug in :meth:`DataFrame.pivot_table` when ``margin`` is ``True`` and only ``column`` is defined (:issue:`31016`) - Fix incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`) - +- Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`) Sparse ^^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 4072d06b9427c..0a23d38ace37e 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -63,7 +63,7 @@ def get_objs_combined_axis( - objs, intersect: bool = False, axis=0, sort: bool = True + objs, intersect: bool = False, axis=0, sort: bool = True, copy: bool = False ) -> Index: """ Extract combined index: return intersection or union (depending on the @@ -81,13 +81,15 @@ def get_objs_combined_axis( The axis to extract indexes from. sort : bool, default True Whether the result index should come out sorted or not. + copy : bool, default False + If True, return a copy of the combined index. Returns ------- Index """ obs_idxes = [obj._get_axis(axis) for obj in objs] - return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy) def _get_distinct_objs(objs: List[Index]) -> List[Index]: @@ -105,7 +107,10 @@ def _get_distinct_objs(objs: List[Index]) -> List[Index]: def _get_combined_index( - indexes: List[Index], intersect: bool = False, sort: bool = False + indexes: List[Index], + intersect: bool = False, + sort: bool = False, + copy: bool = False, ) -> Index: """ Return the union or intersection of indexes. @@ -119,6 +124,8 @@ def _get_combined_index( calculate the union. sort : bool, default False Whether the result index should come out sorted or not. + copy : bool, default False + If True, return a copy of the combined index. 
Returns ------- @@ -143,6 +150,11 @@ def _get_combined_index( index = index.sort_values() except TypeError: pass + + # GH 29879 + if copy: + index = index.copy() + return index diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 9528de36a3664..b42497b507e1f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -517,7 +517,11 @@ def _get_new_axes(self) -> List[Index]: def _get_comb_axis(self, i: int) -> Index: data_axis = self.objs[0]._get_block_manager_axis(i) return get_objs_combined_axis( - self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort + self.objs, + axis=data_axis, + intersect=self.intersect, + sort=self.sort, + copy=self.copy, ) def _get_concat_axis(self) -> Index: diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index b3b2c5a05c6ad..5811f3bc196a1 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2750,3 +2750,17 @@ def test_concat_sparse(): ) result = pd.concat([a, a], axis=1) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("test_series", [True, False]) +def test_concat_copy_index(test_series, axis): + # GH 29879 + if test_series: + ser = Series([1, 2]) + comb = concat([ser, ser], axis=axis, copy=True) + assert comb.index is not ser.index + else: + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + comb = concat([df, df], axis=axis, copy=True) + assert comb.index is not df.index + assert comb.columns is not df.columns From c0046b6f55c68ffc9d63bf8624fc1e926a4b00df Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Tue, 21 Jan 2020 23:11:18 +0800 Subject: [PATCH 141/158] BUG: updated rolling and expanding count for consistency (GH26996) Updated the behavior of rolling and expanding count so that it becomes consistent with all other rolling and expanding functions. Also updated many test cases to reflect this change of behavior. 
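A concrete sketch of the behaviour change described above, using the values from
test_rolling_count_default_min_periods_with_null_values in this patch (the
window size of 3 is inferred from the expected counts)::

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, 3, np.nan, 4, 5, 6])

    # After this patch, count() honours the default min_periods (== window),
    # like every other rolling aggregation, so not-yet-full windows yield NaN:
    s.rolling(3).count()                 # NaN, NaN, 3, 2, 2, 2, 3

    # The previous "always count" behaviour is still available explicitly:
    s.rolling(3, min_periods=0).count()  # 1, 2, 3, 2, 2, 2, 3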
--- pandas/core/window/rolling.py | 17 +++--- .../window/moments/test_moments_expanding.py | 6 +-- .../window/moments/test_moments_rolling.py | 52 +++++++++++++------ pandas/tests/window/test_api.py | 4 +- pandas/tests/window/test_dtypes.py | 8 +-- pandas/tests/window/test_rolling.py | 4 +- 6 files changed, 55 insertions(+), 36 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index c79394a79974b..33a9405786050 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1186,13 +1186,10 @@ def count(self): window = self._get_window() window = min(window, len(obj)) if not self.center else window - # We set the default value min_periods to be 0 because count method - # is meant to count NAs, we don't want it by default requires all - # values in the window to be valid to produce a valid count - min_periods = 0 if self.min_periods is None else self.min_periods - - # this is required as window is mutate above - min_periods = min(min_periods, window) + min_periods = self.min_periods + if min_periods is not None and not self.center: + # this is required as window is mutated above + min_periods = min(min_periods, window) results = [] for b in blocks: @@ -1665,7 +1662,11 @@ def _get_cov(X, Y): mean = lambda x: x.rolling( window, self.min_periods, center=self.center ).mean(**kwargs) - count = (X + Y).rolling(window=window, center=self.center).count(**kwargs) + count = ( + (X + Y) + .rolling(window=window, min_periods=0, center=self.center) + .count(**kwargs) + ) bias_adj = count / (count - ddof) return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj diff --git a/pandas/tests/window/moments/test_moments_expanding.py b/pandas/tests/window/moments/test_moments_expanding.py index 322082187f531..9dfaecee9caeb 100644 --- a/pandas/tests/window/moments/test_moments_expanding.py +++ b/pandas/tests/window/moments/test_moments_expanding.py @@ -40,9 +40,9 @@ def test_expanding_corr(self): tm.assert_almost_equal(rolling_result, result) def test_expanding_count(self): - result = self.series.expanding().count() + result = self.series.expanding(min_periods=0).count() tm.assert_almost_equal( - result, self.series.rolling(window=len(self.series)).count() + result, self.series.rolling(window=len(self.series), min_periods=0).count() ) def test_expanding_quantile(self): @@ -369,7 +369,7 @@ def test_expanding_consistency(self, min_periods): ) self._test_moments_consistency( min_periods=min_periods, - count=lambda x: x.expanding().count(), + count=lambda x: x.expanding(min_periods=min_periods).count(), mean=lambda x: x.expanding(min_periods=min_periods).mean(), corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), diff --git a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index 9acb4ffcb40b8..2de3c25b6d78e 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -777,8 +777,8 @@ def get_result(obj, window, min_periods=None, center=False): series_result = get_result(series, window=win, min_periods=minp) frame_result = get_result(frame, window=win, min_periods=minp) else: - series_result = get_result(series, window=win) - frame_result = get_result(frame, window=win) + series_result = get_result(series, window=win, min_periods=0) + frame_result = get_result(frame, window=win, min_periods=0) last_date = series_result.index[-1] prev_date = last_date - 24 * 
offsets.BDay() @@ -851,10 +851,11 @@ def get_result(obj, window, min_periods=None, center=False): pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=15 )[9:].reset_index(drop=True) else: - result = get_result(obj, 20, center=True) - expected = get_result(pd.concat([obj, Series([np.NaN] * 9)]), 20)[ - 9: - ].reset_index(drop=True) + result = get_result(obj, 20, min_periods=0, center=True) + print(result) + expected = get_result( + pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=0 + )[9:].reset_index(drop=True) tm.assert_series_equal(result, expected) @@ -893,21 +894,27 @@ def get_result(obj, window, min_periods=None, center=False): else: series_xp = ( get_result( - self.series.reindex(list(self.series.index) + s), window=25 + self.series.reindex(list(self.series.index) + s), + window=25, + min_periods=0, ) .shift(-12) .reindex(self.series.index) ) frame_xp = ( get_result( - self.frame.reindex(list(self.frame.index) + s), window=25 + self.frame.reindex(list(self.frame.index) + s), + window=25, + min_periods=0, ) .shift(-12) .reindex(self.frame.index) ) - series_rs = get_result(self.series, window=25, center=True) - frame_rs = get_result(self.frame, window=25, center=True) + series_rs = get_result( + self.series, window=25, min_periods=0, center=True + ) + frame_rs = get_result(self.frame, window=25, min_periods=0, center=True) if fill_value is not None: series_xp = series_xp.fillna(fill_value) @@ -964,7 +971,11 @@ def test_rolling_consistency(self, window, min_periods, center): self._test_moments_consistency_is_constant( min_periods=min_periods, - count=lambda x: (x.rolling(window=window, center=center).count()), + count=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).count() + ), mean=lambda x: ( x.rolling( window=window, min_periods=min_periods, center=center @@ -989,19 +1000,26 @@ def test_rolling_consistency(self, window, min_periods, center): ).var(ddof=0) ), var_debiasing_factors=lambda x: ( - x.rolling(window=window, center=center) + x.rolling(window=window, min_periods=min_periods, center=center) .count() .divide( - (x.rolling(window=window, center=center).count() - 1.0).replace( - 0.0, np.nan - ) + ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).count() + - 1.0 + ).replace(0.0, np.nan) ) ), ) self._test_moments_consistency( min_periods=min_periods, - count=lambda x: (x.rolling(window=window, center=center).count()), + count=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).count() + ), mean=lambda x: ( x.rolling( window=window, min_periods=min_periods, center=center @@ -1071,7 +1089,7 @@ def test_rolling_consistency(self, window, min_periods, center): if name == "count": rolling_f_result = rolling_f() rolling_apply_f_result = x.rolling( - window=window, min_periods=0, center=center + window=window, min_periods=min_periods, center=center ).apply(func=f, raw=True) else: if name in ["cov", "corr"]: diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 5e70e13209de5..680237db0535b 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -237,10 +237,10 @@ def test_count_nonnumeric_types(self): columns=cols, ) - result = df.rolling(window=2).count() + result = df.rolling(window=2, min_periods=0).count() tm.assert_frame_equal(result, expected) - result = df.rolling(1).count() + result = df.rolling(1, min_periods=0).count() expected = df.notna().astype(float) tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index b1c9b66ab09d3..35f93b1262f59 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -34,7 +34,7 @@ class Dtype: def get_expects(self): expects = { "sr1": { - "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "count": Series([np.nan, 2, 2, 2, 2], dtype="float64"), "max": Series([np.nan, 1, 2, 3, 4], dtype="float64"), "min": Series([np.nan, 0, 1, 2, 3], dtype="float64"), "sum": Series([np.nan, 1, 3, 5, 7], dtype="float64"), @@ -44,7 +44,7 @@ def get_expects(self): "median": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), }, "sr2": { - "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "count": Series([np.nan, 2, 2, 2, 2], dtype="float64"), "max": Series([np.nan, 10, 8, 6, 4], dtype="float64"), "min": Series([np.nan, 8, 6, 4, 2], dtype="float64"), "sum": Series([np.nan, 18, 14, 10, 6], dtype="float64"), @@ -54,7 +54,7 @@ def get_expects(self): "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), }, "sr3": { - "count": Series([1, 2, 2, 1, 1], dtype="float64"), + "count": Series([np.nan, 2, 2, 1, 1], dtype="float64"), "max": Series([np.nan, 1, 2, np.nan, np.nan], dtype="float64"), "min": Series([np.nan, 0, 1, np.nan, np.nan], dtype="float64"), "sum": Series([np.nan, 1, 3, np.nan, np.nan], dtype="float64"), @@ -67,7 +67,7 @@ def get_expects(self): }, "df": { "count": DataFrame( - {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, + {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, dtype="float64", ), "max": DataFrame( diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index b1ba19e4b7688..47429741164f3 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -344,7 +344,7 @@ def test_rolling_axis_count(self, axis_frame): else: expected = DataFrame({"x": [1.0, 1.0, 1.0], "y": [2.0, 2.0, 2.0]}) - result = df.rolling(2, axis=axis_frame).count() + result = df.rolling(2, axis=axis_frame, min_periods=0).count() tm.assert_frame_equal(result, expected) def test_readonly_array(self): @@ -469,7 +469,7 @@ def test_rolling_count_default_min_periods_with_null_values(test_series): # we want to by default produce a valid count even if # there are very few valid entries in the window values = [1, 2, 3, np.nan, 4, 5, 6] - expected_counts = [1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0] + expected_counts = [np.nan, np.nan, 3.0, 2.0, 2.0, 2.0, 3.0] if test_series: ser = Series(values) From 4050e4c807e39bf3f4bdcb5651498a5b4a823faf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 21 Jan 2020 08:27:24 -0800 Subject: [PATCH 142/158] ENH: partial string indexing on non-monotonic PeriodIndex (#31096) --- doc/source/user_guide/timeseries.rst | 5 ++ doc/source/whatsnew/v1.1.0.rst | 21 ++++++++ pandas/core/indexes/period.py | 46 +++++++++++----- .../indexes/period/test_partial_slicing.py | 53 +++++++++++++++++-- 4 files changed, 108 insertions(+), 17 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 08b2ae0a4a837..3fdab0fd26643 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1951,6 +1951,10 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th PeriodIndex partial string indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +PeriodIndex now supports partial string slicing with non-monotonic indexes. + +.. 
versionadded:: 1.1.0 + You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodIndex``, in the same manner as ``DatetimeIndex``. For details, refer to :ref:`DatetimeIndex Partial String Indexing `. .. ipython:: python @@ -1981,6 +1985,7 @@ As with ``DatetimeIndex``, the endpoints will be included in the result. The exa dfp['2013-01-01 10H':'2013-01-01 11H'] + Frequency conversion and resampling with PeriodIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The frequency of ``Period`` and ``PeriodIndex`` can be converted via the ``asfreq`` diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8cbc95f0349cf..59c90534beefd 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -13,6 +13,27 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_110.period_index_partial_string_slicing: + +Nonmonotonic PeriodIndex Partial String Slicing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:class:`PeriodIndex` now supports partial string slicing for non-monotonic indexes, mirroring :class:`DatetimeIndex` behavior (:issue:`31096`) + +For example: + +.. ipython:: python + + dti = pd.date_range("2014-01-01", periods=30, freq="30D") + pi = dti.to_period("D") + ser_monotonic = pd.Series(np.arange(30), index=pi) + shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2)) + ser = ser_monotonic[shuffler] + ser + +.. ipython:: python + ser["2014"] + ser.loc["May 2015"] + .. _whatsnew_110.enhancements.other: Other enhancements diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b3386f6104032..2a40f4a6f6239 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -567,6 +567,11 @@ def get_loc(self, key, method=None, tolerance=None): """ if isinstance(key, str): + try: + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError, OverflowError): + pass + try: asdt, reso = parse_time_string(key, self.freq) key = asdt @@ -648,10 +653,6 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime): def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): # TODO: Check for non-True use_lhs/use_rhs - raw = key - if not self.is_monotonic: - raise ValueError("Partial indexing only valid for ordered time series") - parsed, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) @@ -660,18 +661,35 @@ def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True # TODO: we used to also check for # reso in ["day", "hour", "minute", "second"] # why is that check not needed? - raise TypeError(key) + raise ValueError(key) t1, t2 = self._parsed_string_to_bounds(reso, parsed) - if len(self): - if t2 < self.min() or t1 > self.max(): - raise KeyError(raw) - - # Use asi8 searchsorted to avoid overhead of re-validating inputs - return slice( - self.asi8.searchsorted(t1.ordinal, side="left"), - self.asi8.searchsorted(t2.ordinal, side="right"), - ) + i8vals = self.asi8 + + if self.is_monotonic: + + # we are out of range + if len(self) and ( + (use_lhs and t1 < self[0] and t2 < self[0]) + or ((use_rhs and t1 > self[-1] and t2 > self[-1])) + ): + raise KeyError(key) + + # TODO: does this depend on being monotonic _increasing_? + # If so, DTI will also be affected. 
+ + # a monotonic (sorted) series can be sliced + # Use asi8.searchsorted to avoid re-validating Periods + left = i8vals.searchsorted(t1.ordinal, side="left") if use_lhs else None + right = i8vals.searchsorted(t2.ordinal, side="right") if use_rhs else None + return slice(left, right) + + else: + lhs_mask = (i8vals >= t1.ordinal) if use_lhs else True + rhs_mask = (i8vals <= t2.ordinal) if use_rhs else True + + # try to find a the dates + return (lhs_mask & rhs_mask).nonzero()[0] def _convert_tolerance(self, tolerance, target): tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, target) diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 9ca2dd169416f..833901ea7ba22 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -7,9 +7,6 @@ class TestPeriodIndex: - def setup_method(self, method): - pass - def test_slice_with_negative_step(self): ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M")) SLC = pd.IndexSlice @@ -133,3 +130,53 @@ def test_range_slice_outofbounds(self): tm.assert_frame_equal(df["2013/10/15":"2013/10/17"], empty) tm.assert_frame_equal(df["2013-06":"2013-09"], empty) tm.assert_frame_equal(df["2013-11":"2013-12"], empty) + + def test_partial_slice_doesnt_require_monotonicity(self): + # See also: DatetimeIndex test ofm the same name + dti = pd.date_range("2014-01-01", periods=30, freq="30D") + pi = dti.to_period("D") + + ser_montonic = pd.Series(np.arange(30), index=pi) + + shuffler = list(range(0, 30, 2)) + list(range(1, 31, 2)) + ser = ser_montonic[shuffler] + nidx = ser.index + + # Manually identified locations of year==2014 + indexer_2014 = np.array( + [0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18, 19, 20], dtype=np.intp + ) + assert (nidx[indexer_2014].year == 2014).all() + assert not (nidx[~indexer_2014].year == 2014).any() + + result = nidx.get_loc("2014") + tm.assert_numpy_array_equal(result, indexer_2014) + + expected = ser[indexer_2014] + + result = nidx.get_value(ser, "2014") + tm.assert_series_equal(result, expected) + + result = ser.loc["2014"] + tm.assert_series_equal(result, expected) + + result = ser["2014"] + tm.assert_series_equal(result, expected) + + # Manually identified locations where ser.index is within Mat 2015 + indexer_may2015 = np.array([23], dtype=np.intp) + assert nidx[23].year == 2015 and nidx[23].month == 5 + + result = nidx.get_loc("May 2015") + tm.assert_numpy_array_equal(result, indexer_may2015) + + expected = ser[indexer_may2015] + + result = nidx.get_value(ser, "May 2015") + tm.assert_series_equal(result, expected) + + result = ser.loc["May 2015"] + tm.assert_series_equal(result, expected) + + result = ser["May 2015"] + tm.assert_series_equal(result, expected) From d77b1d800d95708ec10e573165468f9e9d6a4ace Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 21 Jan 2020 10:26:44 -0800 Subject: [PATCH 143/158] TST: Add more regression tests for fixed issues (#31171) * TST: Add more regression tests for fixed issues * Fix lint and platform compat * Using intp * Move interval indexing test to appropriate location --- pandas/tests/frame/test_constructors.py | 6 +++ .../tests/groupby/aggregate/test_aggregate.py | 16 ++++++++ .../tests/indexes/interval/test_indexing.py | 12 ++++++ .../tests/indexing/multiindex/test_getitem.py | 10 +++++ pandas/tests/indexing/multiindex/test_loc.py | 19 +++++++++ .../tests/resample/test_resampler_grouper.py | 17 ++++++++ 
pandas/tests/reshape/test_pivot.py | 40 +++++++++++++++++++ 7 files changed, 120 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b1620df91ba26..7b1a9d8ff6ae3 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -412,6 +412,12 @@ def test_constructor_dict_order_insertion(self): expected = DataFrame(data=d, columns=list("ba")) tm.assert_frame_equal(frame, expected) + def test_constructor_dict_nan_key_and_columns(self): + # GH 16894 + result = pd.DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2]) + expected = pd.DataFrame([[1, 2], [2, 3]], columns=[np.nan, 2]) + tm.assert_frame_equal(result, expected) + def test_constructor_multi_index(self): # GH 4078 # construction error with mi and all-nan frame diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3d842aca210ed..0a7272bbc131c 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -630,6 +630,22 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) +def test_aggregate_mixed_types(): + # GH 16916 + df = pd.DataFrame( + data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc") + ) + df["grouping"] = ["group 1", "group 1", 2] + result = df.groupby("grouping").aggregate(lambda x: x.tolist()) + expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]] + expected = pd.DataFrame( + expected_data, + index=Index([2, "group 1"], dtype="object", name="grouping"), + columns=Index(["X", "Y", "Z"], dtype="object"), + ) + tm.assert_frame_equal(result, expected) + + class TestLambdaMangling: def test_basic(self): df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 1bfc58733a110..87b72f702e2aa 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -312,6 +312,18 @@ def test_get_indexer_non_unique_with_int_and_float(self, query, expected): # TODO we may also want to test get_indexer for the case when # the intervals are duplicated, decreasing, non-monotonic, etc.. 
+ def test_get_indexer_non_monotonic(self): + # GH 16410 + idx1 = IntervalIndex.from_tuples([(2, 3), (4, 5), (0, 1)]) + idx2 = IntervalIndex.from_tuples([(0, 1), (2, 3), (6, 7), (8, 9)]) + result = idx1.get_indexer(idx2) + expected = np.array([2, 0, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + result = idx1.get_indexer(idx1[1:]) + expected = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestSliceLocs: def test_slice_locs_with_interval(self): diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 8ea825da8f94f..c15fa34283f21 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -250,3 +250,13 @@ def test_frame_mi_access_returns_frame(dataframe_with_duplicate_index): ).T result = df["A"]["B2"] tm.assert_frame_equal(result, expected) + + +def test_frame_mi_empty_slice(): + # GH 15454 + df = DataFrame(0, index=range(2), columns=MultiIndex.from_product([[1], [2]])) + result = df[[]] + expected = DataFrame( + index=[0, 1], columns=MultiIndex(levels=[[1], [2]], codes=[[], []]) + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 3b8aa963ac698..b7802d9b8fe0c 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -468,3 +468,22 @@ def test_loc_period_string_indexing(): ), ) tm.assert_series_equal(result, expected) + + +def test_loc_datetime_mask_slicing(): + # GH 16699 + dt_idx = pd.to_datetime(["2017-05-04", "2017-05-05"]) + m_idx = pd.MultiIndex.from_product([dt_idx, dt_idx], names=["Idx1", "Idx2"]) + df = pd.DataFrame( + data=[[1, 2], [3, 4], [5, 6], [7, 6]], index=m_idx, columns=["C1", "C2"] + ) + result = df.loc[(dt_idx[0], (df.index.get_level_values(1) > "2017-05-04")), "C1"] + expected = pd.Series( + [3], + name="C1", + index=MultiIndex.from_tuples( + [(pd.Timestamp("2017-05-04"), pd.Timestamp("2017-05-05"))], + names=["Idx1", "Idx2"], + ), + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 4e3585c0be884..03c1445e099a0 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -230,6 +230,23 @@ def f(x): tm.assert_series_equal(result, expected) +def test_apply_columns_multilevel(): + # GH 16231 + cols = pd.MultiIndex.from_tuples([("A", "a", "", "one"), ("B", "b", "i", "two")]) + ind = date_range(start="2017-01-01", freq="15Min", periods=8) + df = DataFrame(np.array([0] * 16).reshape(8, 2), index=ind, columns=cols) + agg_dict = {col: (np.sum if col[3] == "one" else np.mean) for col in df.columns} + result = df.resample("H").apply(lambda x: agg_dict[x.name](x)) + expected = DataFrame( + np.array([0] * 4).reshape(2, 2), + index=date_range(start="2017-01-01", freq="1H", periods=2), + columns=pd.MultiIndex.from_tuples( + [("A", "a", "", "one"), ("B", "b", "i", "two")] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_resample_groupby_with_label(): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 6850c52ca05ea..fe75aef1ca3d7 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2649,6 +2649,46 @@ def 
test_crosstab_unsorted_order(self): ) tm.assert_frame_equal(result, expected) + def test_crosstab_normalize_multiple_columns(self): + # GH 15150 + df = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": [0] * 24, + "E": [0] * 24, + } + ) + result = pd.crosstab( + [df.A, df.B], + df.C, + values=df.D, + aggfunc=np.sum, + normalize=True, + margins=True, + ) + expected = pd.DataFrame( + np.array([0] * 29 + [1], dtype=float).reshape(10, 3), + columns=Index(["bar", "foo", "All"], dtype="object", name="C"), + index=MultiIndex.from_tuples( + [ + ("one", "A"), + ("one", "B"), + ("one", "C"), + ("three", "A"), + ("three", "B"), + ("three", "C"), + ("two", "A"), + ("two", "B"), + ("two", "C"), + ("All", ""), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) + def test_margin_normalize(self): # GH 27500 df = pd.DataFrame( From e31c5ad5464e9f6a72bc9f99dfa7a4b095f9ca5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nico=20Schl=C3=B6mer?= Date: Tue, 21 Jan 2020 22:11:55 +0100 Subject: [PATCH 144/158] COMPAT: numpy test warnings (#30345) --- pandas/tests/arrays/test_integer.py | 2 ++ pandas/tests/plotting/test_datetimelike.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index f1a7cc741603d..9cb1e4176df96 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -935,6 +935,8 @@ def test_astype_nansafe(): @pytest.mark.parametrize("ufunc", [np.abs, np.sign]) +# np.sign emits a warning with nans, +@pytest.mark.filterwarnings("ignore:invalid value encountered in sign") def test_ufuncs_single_int(ufunc): a = integer_array([1, 2, -3, np.nan]) result = ufunc(a) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index fb86b600d3d3c..84d298cd7c6fe 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -43,7 +43,13 @@ def setup_method(self, method): def teardown_method(self, method): tm.close() + # Ignore warning + # ``` + # Converting to PeriodArray/Index representation will drop timezone information. + # ``` + # which occurs for UTC-like timezones. 
@pytest.mark.slow + @pytest.mark.filterwarnings("ignore:msg:UserWarning") def test_ts_plot_with_tz(self, tz_aware_fixture): # GH2877, GH17173 tz = tz_aware_fixture From c3f492f5868f71290bbd3685531903bb79e0e493 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 21 Jan 2020 16:55:52 -0800 Subject: [PATCH 145/158] REF: move misplaced tests (#31189) --- pandas/tests/indexes/datetimes/test_snap.py | 37 ++++++++++++++++ pandas/tests/indexes/multi/test_setops.py | 11 +++++ .../{test_set_ops.py => test_insert.py} | 12 +----- pandas/tests/series/indexing/test_datetime.py | 43 ------------------- pandas/tests/series/methods/test_round.py | 10 +++++ 5 files changed, 59 insertions(+), 54 deletions(-) create mode 100644 pandas/tests/indexes/datetimes/test_snap.py rename pandas/tests/indexing/multiindex/{test_set_ops.py => test_insert.py} (72%) diff --git a/pandas/tests/indexes/datetimes/test_snap.py b/pandas/tests/indexes/datetimes/test_snap.py new file mode 100644 index 0000000000000..a21d27d23f6b5 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_snap.py @@ -0,0 +1,37 @@ +import pytest + +from pandas import DatetimeIndex, date_range +import pandas._testing as tm + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +@pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) +@pytest.mark.parametrize("name", [None, "my_dti"]) +def test_dti_snap(name, tz): + dti = DatetimeIndex( + [ + "1/1/2002", + "1/2/2002", + "1/3/2002", + "1/4/2002", + "1/5/2002", + "1/6/2002", + "1/7/2002", + ], + name=name, + tz=tz, + freq="D", + ) + + result = dti.snap(freq="W-MON") + expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") + expected = expected.repeat([3, 4]) + tm.assert_index_equal(result, expected) + assert result.tz == expected.tz + + result = dti.snap(freq="B") + + expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") + expected = expected.repeat([1, 1, 1, 2, 2]) + tm.assert_index_equal(result, expected) + assert result.tz == expected.tz diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 841e3b3f17b38..f949db537de67 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -110,6 +110,17 @@ def test_symmetric_difference(idx, sort): first.symmetric_difference([1, 2, 3], sort=sort) +def test_multiindex_symmetric_difference(): + # GH 13490 + idx = MultiIndex.from_product([["a", "b"], ["A", "B"]], names=["a", "b"]) + result = idx ^ idx + assert result.names == idx.names + + idx2 = idx.copy().rename(["A", "B"]) + result = idx ^ idx2 + assert result.names == [None, None] + + def test_empty(idx): # GH 15270 assert not idx.empty diff --git a/pandas/tests/indexing/multiindex/test_set_ops.py b/pandas/tests/indexing/multiindex/test_insert.py similarity index 72% rename from pandas/tests/indexing/multiindex/test_set_ops.py rename to pandas/tests/indexing/multiindex/test_insert.py index f2cbfadb3cfa5..835e61da2fb3e 100644 --- a/pandas/tests/indexing/multiindex/test_set_ops.py +++ b/pandas/tests/indexing/multiindex/test_insert.py @@ -4,17 +4,7 @@ import pandas._testing as tm -class TestMultiIndexSetOps: - def test_multiindex_symmetric_difference(self): - # GH 13490 - idx = MultiIndex.from_product([["a", "b"], ["A", "B"]], names=["a", "b"]) - result = idx ^ idx - assert result.names == idx.names - - idx2 = idx.copy().rename(["A", "B"]) - result = idx ^ idx2 - assert result.names == [None, None] - +class TestMultiIndexInsertion: def 
test_mixed_depth_insert(self): arrays = [ ["a", "top", "top", "routine1", "routine1", "routine2"], diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 15ff5f6b343d1..77085ef547690 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -49,39 +49,6 @@ def test_fancy_setitem(): assert (s[48:54] == -3).all() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -@pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) -@pytest.mark.parametrize("name", [None, "my_dti"]) -def test_dti_snap(name, tz): - dti = DatetimeIndex( - [ - "1/1/2002", - "1/2/2002", - "1/3/2002", - "1/4/2002", - "1/5/2002", - "1/6/2002", - "1/7/2002", - ], - name=name, - tz=tz, - freq="D", - ) - - result = dti.snap(freq="W-MON") - expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") - expected = expected.repeat([3, 4]) - tm.assert_index_equal(result, expected) - assert result.tz == expected.tz - - result = dti.snap(freq="B") - - expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") - expected = expected.repeat([1, 1, 1, 2, 2]) - tm.assert_index_equal(result, expected) - assert result.tz == expected.tz - - def test_dti_reset_index_round_trip(): dti = date_range(start="1/1/2001", end="6/1/2001", freq="D") d1 = DataFrame({"v": np.random.rand(len(dti))}, index=dti) @@ -751,16 +718,6 @@ def test_nat_operations(): assert s.max() == exp -@pytest.mark.parametrize("method", ["round", "floor", "ceil"]) -@pytest.mark.parametrize("freq", ["s", "5s", "min", "5min", "h", "5h"]) -def test_round_nat(method, freq): - # GH14940 - s = Series([pd.NaT]) - expected = Series(pd.NaT) - round_method = getattr(s.dt, method) - tm.assert_series_equal(round_method(freq), expected) - - def test_setitem_tuple_with_datetimetz(): # GH 20441 arr = date_range("2017", periods=4, tz="US/Eastern") diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py index 7f0711a0f30d7..88d5c428712dc 100644 --- a/pandas/tests/series/methods/test_round.py +++ b/pandas/tests/series/methods/test_round.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import Series import pandas._testing as tm @@ -44,3 +45,12 @@ def test_round_builtin(self): expected_rounded = Series([1.12, 2.12, 3.12], index=range(3)) result = round(ser, decimals) tm.assert_series_equal(result, expected_rounded) + + @pytest.mark.parametrize("method", ["round", "floor", "ceil"]) + @pytest.mark.parametrize("freq", ["s", "5s", "min", "5min", "h", "5h"]) + def test_round_nat(self, method, freq): + # GH14940 + ser = Series([pd.NaT]) + expected = Series(pd.NaT) + round_method = getattr(ser.dt, method) + tm.assert_series_equal(round_method(freq), expected) From 6dea55786e3f95055b65934e8d4a61bada54ea2d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 22 Jan 2020 00:15:06 -0800 Subject: [PATCH 146/158] CLN: remove unused fixtures (#31192) --- pandas/tests/indexes/conftest.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index e3e7ff4093b76..b1dcf0ed9b44b 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -1,7 +1,5 @@ -import numpy as np import pytest -import pandas as pd import pandas._testing as tm from pandas.core.indexes.api import Index, MultiIndex @@ -28,25 +26,3 @@ def indices(request): # copy to avoid mutation, 
e.g. setting .name return indices_dict[request.param].copy() - - -@pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) -def one(request): - # zero-dim integer array behaves like an integer - return request.param - - -zeros = [ - box([0] * 5, dtype=dtype) - for box in [pd.Index, np.array] - for dtype in [np.int64, np.uint64, np.float64] -] -zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]) -zeros.extend([0, 0.0]) - - -@pytest.fixture(params=zeros) -def zero(request): - # For testing division by (or of) zero for Index with length 5, this - # gives several scalar-zeros and length-5 vector-zeros - return request.param From f4c99ff91469ac345de42115e144f65d604f88aa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 22 Jan 2020 06:16:45 -0800 Subject: [PATCH 147/158] REG: restore format_type attr (#31017) * REG: restore format_type attr --- pandas/io/pytables.py | 2 ++ pandas/tests/io/pytables/test_store.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3e4673c890bef..c1e12887b0150 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2472,6 +2472,7 @@ class Fixed: """ pandas_kind: str + format_type: str = "fixed" # GH#30962 needed by dask obj_type: Type[Union[DataFrame, Series]] ndim: int encoding: str @@ -3129,6 +3130,7 @@ class Table(Fixed): """ pandas_kind = "wide_table" + format_type: str = "table" # GH#30962 needed by dask table_type: str levels = 1 is_table = True diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 64c4ad800f49d..f56d042093886 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -64,6 +64,16 @@ @pytest.mark.single class TestHDFStore: + def test_format_type(self, setup_path): + df = pd.DataFrame({"A": [1, 2]}) + with ensure_clean_path(setup_path) as path: + with HDFStore(path) as store: + store.put("a", df, format="fixed") + store.put("b", df, format="table") + + assert store.get_storer("a").format_type == "fixed" + assert store.get_storer("b").format_type == "table" + def test_format_kwarg_in_constructor(self, setup_path): # GH 13291 From 67528335473d4e1f4bb7b0df78d66f69b1a7507e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 22 Jan 2020 08:46:58 -0600 Subject: [PATCH 148/158] CI: numpydev ragged array dtype warning (#31203) --- pandas/core/indexes/multi.py | 2 +- pandas/core/strings.py | 2 +- pandas/tests/arrays/categorical/test_constructors.py | 2 +- pandas/tests/arrays/categorical/test_missing.py | 2 +- pandas/tests/extension/base/getitem.py | 10 +++++++--- pandas/tests/extension/json/array.py | 5 +++++ pandas/tests/extension/json/test_json.py | 4 ---- 7 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b684908c25fe5..a26a01ab7be21 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2058,7 +2058,7 @@ def drop(self, codes, level=None, errors="raise"): if not isinstance(codes, (np.ndarray, Index)): try: - codes = com.index_labels_to_array(codes) + codes = com.index_labels_to_array(codes, dtype=object) except ValueError: pass diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 4bcf2943e3d6e..18c7504f2c2f8 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -79,7 +79,7 @@ def cat_core(list_of_columns: List, sep: str): return np.sum(arr_of_cols, axis=0) list_with_sep = [sep] * (2 * len(list_of_columns) - 1) 
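    # (How the interleave below works: for list_of_columns [a, b, c] and
    # sep "-", list_with_sep starts as ["-", "-", "-", "-", "-"]; assigning
    # to the even slots overwrites it to [a, "-", b, "-", c], so the final
    # row-wise sum concatenates each column with the separator in between.)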
list_with_sep[::2] = list_of_columns - arr_with_sep = np.asarray(list_with_sep) + arr_with_sep = np.asarray(list_with_sep, dtype=object) return np.sum(arr_with_sep, axis=0) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 70a23e9748dd1..cfba3da354d44 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -605,6 +605,6 @@ def test_constructor_imaginary(self): @pytest.mark.skipif(_np_version_under1p16, reason="Skipping for NumPy <1.16") def test_constructor_string_and_tuples(self): # GH 21416 - c = pd.Categorical(["c", ("a", "b"), ("b", "a"), "c"]) + c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) expected_index = pd.Index([("a", "b"), ("b", "a"), "c"]) assert c.categories.equals(expected_index) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 211bf091ee17d..8889f45a84237 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -77,7 +77,7 @@ def test_fillna_iterable_category(self, named): Point = collections.namedtuple("Point", "x y") else: Point = lambda *args: args # tuple - cat = Categorical([Point(0, 0), Point(0, 1), None]) + cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object)) result = cat.fillna(Point(0, 0)) expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)]) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index dc1f62c4c97c5..e0f3a4754221f 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -245,7 +245,9 @@ def test_take_non_na_fill_value(self, data_missing): fill_value = data_missing[1] # valid na = data_missing[0] - array = data_missing._from_sequence([na, fill_value, na]) + array = data_missing._from_sequence( + [na, fill_value, na], dtype=data_missing.dtype + ) result = array.take([-1, 1], fill_value=fill_value, allow_fill=True) expected = array.take([1, 1]) self.assert_extension_array_equal(result, expected) @@ -293,10 +295,12 @@ def test_reindex_non_na_fill_value(self, data_missing): valid = data_missing[1] na = data_missing[0] - array = data_missing._from_sequence([na, valid]) + array = data_missing._from_sequence([na, valid], dtype=data_missing.dtype) ser = pd.Series(array) result = ser.reindex([0, 1, 2], fill_value=valid) - expected = pd.Series(data_missing._from_sequence([na, valid, valid])) + expected = pd.Series( + data_missing._from_sequence([na, valid, valid], dtype=data_missing.dtype) + ) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 17bc2773aad19..a065c33689c78 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -113,6 +113,11 @@ def __setitem__(self, key, value): def __len__(self) -> int: return len(self.data) + def __array__(self, dtype=None): + if dtype is None: + dtype = object + return np.asarray(self.data, dtype=dtype) + @property def nbytes(self) -> int: return sys.getsizeof(self.data) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 4d3145109e3c2..dc03a1f1dcf72 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -163,10 +163,6 @@ def test_unstack(self, data, index): # this matches otherwise 
return super().test_unstack(data, index) - @pytest.mark.xfail(reason="Inconsistent sizes.") - def test_transpose(self, data): - super().test_transpose(data) - class TestGetitem(BaseJSON, base.BaseGetitemTests): pass From 8807ba16b356a829dee6a86e2865407d9b0492b0 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Wed, 22 Jan 2020 23:07:40 +0800 Subject: [PATCH 149/158] CLN: further cleaned code (GH26996) --- pandas/core/window/rolling.py | 13 ++----------- pandas/tests/window/moments/test_moments_rolling.py | 4 ++-- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 33a9405786050..08cbc5064d2db 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1182,22 +1182,13 @@ class _Rolling_and_Expanding(_Rolling): def count(self): blocks, obj = self._create_blocks() - - window = self._get_window() - window = min(window, len(obj)) if not self.center else window - - min_periods = self.min_periods - if min_periods is not None and not self.center: - # this is required as window is mutated above - min_periods = min(min_periods, window) - results = [] for b in blocks: result = b.notna().astype(int) result = self._constructor( result, - window=window, - min_periods=min_periods, + window=self._get_window(), + min_periods=self.min_periods, center=self.center, axis=self.axis, closed=self.closed, diff --git a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index 2de3c25b6d78e..83e4ee25558b5 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -835,8 +835,8 @@ def get_result(obj, window, min_periods=None, center=False): nan_mask = ~nan_mask tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) else: - result = get_result(self.series, len(self.series) + 1) - expected = get_result(self.series, len(self.series)) + result = get_result(self.series, len(self.series) + 1, min_periods=0) + expected = get_result(self.series, len(self.series), min_periods=0) nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) From 118dc6cbc35c77f5fe6db2d7340d406fcf46567e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 22 Jan 2020 13:37:57 -0600 Subject: [PATCH 150/158] DOC: fixup whatsnew (#31217) --- doc/source/whatsnew/v1.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 59c90534beefd..c8e811ce82b1f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -17,6 +17,7 @@ Enhancements Nonmonotonic PeriodIndex Partial String Slicing ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + :class:`PeriodIndex` now supports partial string slicing for non-monotonic indexes, mirroring :class:`DatetimeIndex` behavior (:issue:`31096`) For example: @@ -31,6 +32,7 @@ For example: ser .. 
ipython:: python + ser["2014"] ser.loc["May 2015"] From 774498b667e82bf6e826da44135a3ef99590ead6 Mon Sep 17 00:00:00 2001 From: Rik-de-Kort <32839123+Rik-de-Kort@users.noreply.github.com> Date: Wed, 22 Jan 2020 21:11:23 +0100 Subject: [PATCH 151/158] Follow-up: XLSB Support (#31215) --- doc/source/getting_started/install.rst | 2 +- pandas/compat/_optional.py | 2 +- pandas/tests/io/excel/test_readers.py | 5 ----- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b5c512cdc8328..8f5900a2a1ba6 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -264,7 +264,7 @@ pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pytables 3.4.2 HDF5 reading / writing -pyxlsb 1.0.5 Reading for xlsb files +pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O s3fs 0.3.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index d561ab9a10548..cd711bcace013 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -19,7 +19,7 @@ "pyarrow": "0.13.0", "pytables": "3.4.2", "pytest": "5.0.1", - "pyxlsb": "1.0.5", + "pyxlsb": "1.0.6", "s3fs": "0.3.0", "scipy": "0.19.0", "sqlalchemy": "1.1.4", diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f8ff3567b8b64..8d00ef1b7fe3e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -562,11 +562,6 @@ def test_bad_engine_raises(self, read_ext): @tm.network def test_read_from_http_url(self, read_ext): - if read_ext == ".xlsb": - pytest.xfail("xlsb files not present in master repo yet") - if pd.read_excel.keywords["engine"] == "pyxlsb": - pytest.xfail("Sheets containing datetimes not supported by pyxlsb") - url = ( "https://raw.githubusercontent.com/pandas-dev/pandas/master/" "pandas/tests/io/data/excel/test1" + read_ext From 6f395ad4215cafe6c1fa237e6a018aa32264985d Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Thu, 23 Jan 2020 02:49:54 +0200 Subject: [PATCH 152/158] Replaced set comprehension with a generator (#31229) --- pandas/tests/io/parser/test_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index b8d66874bc660..b7164477c31f2 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -205,7 +205,7 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): with caplog.at_level(logging.DEBUG, logger="s3fs"): read_csv("s3://pandas-test/large-file.csv", nrows=5) # log of fetch_range (start, stop) - assert (0, 5505024) in {x.args[-2:] for x in caplog.records} + assert (0, 5505024) in (x.args[-2:] for x in caplog.records) def test_read_s3_with_hash_in_key(self, tips_df): # GH 25945 From e83a6bddac8c89b144dfe0783594dd332c5b3030 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 22 Jan 2020 22:26:32 -0600 Subject: [PATCH 153/158] BUG: Fixed upcast dtype for datetime64 in merge (#31211) --- pandas/core/internals/concat.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 
c6f30ef65e9d5..c75373b82305c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -350,7 +350,7 @@ def _get_empty_dtype_and_na(join_units): dtype = upcast_classes["datetimetz"] return dtype[0], tslibs.NaT elif "datetime" in upcast_classes: - return np.dtype("M8[ns]"), tslibs.iNaT + return np.dtype("M8[ns]"), np.datetime64("NaT", "ns") elif "timedelta" in upcast_classes: return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 30c440035d48e..f9acf5b60a3cd 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2152,3 +2152,20 @@ def test_merge_multiindex_columns(): expected["id"] = "" tm.assert_frame_equal(result, expected) + + +def test_merge_datetime_upcast_dtype(): + # https://github.com/pandas-dev/pandas/issues/31208 + df1 = pd.DataFrame({"x": ["a", "b", "c"], "y": ["1", "2", "4"]}) + df2 = pd.DataFrame( + {"y": ["1", "2", "3"], "z": pd.to_datetime(["2000", "2001", "2002"])} + ) + result = pd.merge(df1, df2, how="left", on="y") + expected = pd.DataFrame( + { + "x": ["a", "b", "c"], + "y": ["1", "2", "4"], + "z": pd.to_datetime(["2000", "2001", "NaT"]), + } + ) + tm.assert_frame_equal(result, expected) From bcce1291eaf4a7ebd085d23dd0e0cd6c9ded2c3f Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 24 Jan 2020 14:20:15 +0800 Subject: [PATCH 154/158] TST: cleaned up tests (GH26996) --- pandas/tests/window/test_rolling.py | 32 +++++++++-------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 47429741164f3..1f4289bf6fad9 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -448,21 +448,16 @@ def test_min_periods1(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("test_series", [True, False]) -def test_rolling_count_with_min_periods(test_series): +@pytest.mark.parametrize("constructor", [Series, DataFrame]) +def test_rolling_count_with_min_periods(constructor): # GH 26996 - if test_series: - result = Series(range(5)).rolling(3, min_periods=3).count() - expected = Series([np.nan, np.nan, 3.0, 3.0, 3.0]) - tm.assert_series_equal(result, expected) - else: - result = DataFrame(range(5)).rolling(3, min_periods=3).count() - expected = DataFrame([np.nan, np.nan, 3.0, 3.0, 3.0]) - tm.assert_frame_equal(result, expected) + result = constructor(range(5)).rolling(3, min_periods=3).count() + expected = constructor([np.nan, np.nan, 3.0, 3.0, 3.0]) + tm.assert_equal(result, expected) -@pytest.mark.parametrize("test_series", [True, False]) -def test_rolling_count_default_min_periods_with_null_values(test_series): +@pytest.mark.parametrize("constructor", [Series, DataFrame]) +def test_rolling_count_default_min_periods_with_null_values(constructor): # GH 26996 # We need rolling count to have default min_periods=0, # as the method is meant to count how many non-null values, @@ -471,13 +466,6 @@ def test_rolling_count_default_min_periods_with_null_values(test_series): values = [1, 2, 3, np.nan, 4, 5, 6] expected_counts = [np.nan, np.nan, 3.0, 2.0, 2.0, 2.0, 3.0] - if test_series: - ser = Series(values) - result = ser.rolling(3).count() - expected = Series(expected_counts) - tm.assert_series_equal(result, expected) - else: - df = DataFrame(values) - result = df.rolling(3).count() - expected = DataFrame(expected_counts) - 
tm.assert_frame_equal(result, expected) + result = constructor(values).rolling(3).count() + expected = constructor(expected_counts) + tm.assert_equal(result, expected) From 1a4352b5bcd4aa0eb01e25db21114bc66f434573 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 24 Jan 2020 15:28:38 +0800 Subject: [PATCH 155/158] BUG: changed min_periods default to 0 for rolling and expanding (GH26996) --- pandas/_libs/tslibs/timestamps.pyx | 3 +++ pandas/core/window/rolling.py | 2 +- pandas/tests/window/test_dtypes.py | 8 ++++---- pandas/tests/window/test_expanding.py | 20 ++++++++++++++++---- pandas/tests/window/test_rolling.py | 6 +----- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 36566b55e74ad..82ce99bbae285 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -375,11 +375,13 @@ class Timestamp(_Timestamp): # Mixing pydatetime positional and keyword arguments is forbidden! cdef _TSObject ts + print('haha') _date_attributes = [year, month, day, hour, minute, second, microsecond, nanosecond] if tzinfo is not None: + print('tzinfo not None') if not PyTZInfo_Check(tzinfo): # tzinfo must be a datetime.tzinfo object, GH#17690 raise TypeError( @@ -392,6 +394,7 @@ class Timestamp(_Timestamp): tz, tzinfo = tzinfo, None if isinstance(ts_input, str): + print('ts_input is str') # User passed a date string to parse. # Check that the user didn't also pass a date attribute kwarg. if any(arg is not None for arg in _date_attributes): diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 08cbc5064d2db..fc0431d01cba7 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1188,7 +1188,7 @@ def count(self): result = self._constructor( result, window=self._get_window(), - min_periods=self.min_periods, + min_periods=self.min_periods if self.min_periods else 0, center=self.center, axis=self.axis, closed=self.closed, diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 35f93b1262f59..b1c9b66ab09d3 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -34,7 +34,7 @@ class Dtype: def get_expects(self): expects = { "sr1": { - "count": Series([np.nan, 2, 2, 2, 2], dtype="float64"), + "count": Series([1, 2, 2, 2, 2], dtype="float64"), "max": Series([np.nan, 1, 2, 3, 4], dtype="float64"), "min": Series([np.nan, 0, 1, 2, 3], dtype="float64"), "sum": Series([np.nan, 1, 3, 5, 7], dtype="float64"), @@ -44,7 +44,7 @@ def get_expects(self): "median": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), }, "sr2": { - "count": Series([np.nan, 2, 2, 2, 2], dtype="float64"), + "count": Series([1, 2, 2, 2, 2], dtype="float64"), "max": Series([np.nan, 10, 8, 6, 4], dtype="float64"), "min": Series([np.nan, 8, 6, 4, 2], dtype="float64"), "sum": Series([np.nan, 18, 14, 10, 6], dtype="float64"), @@ -54,7 +54,7 @@ def get_expects(self): "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), }, "sr3": { - "count": Series([np.nan, 2, 2, 1, 1], dtype="float64"), + "count": Series([1, 2, 2, 1, 1], dtype="float64"), "max": Series([np.nan, 1, 2, np.nan, np.nan], dtype="float64"), "min": Series([np.nan, 0, 1, np.nan, np.nan], dtype="float64"), "sum": Series([np.nan, 1, 3, np.nan, np.nan], dtype="float64"), @@ -67,7 +67,7 @@ def get_expects(self): }, "df": { "count": DataFrame( - {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, + {0: Series([1, 2, 2, 2, 2]), 1: 
Series([1, 2, 2, 2, 2])},
                 dtype="float64",
             ),
             "max": DataFrame(
diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py
index 58ad20e473560..6b6367fd80b26 100644
--- a/pandas/tests/window/test_expanding.py
+++ b/pandas/tests/window/test_expanding.py
@@ -115,8 +115,20 @@ def test_expanding_axis(self, axis_frame):
         tm.assert_frame_equal(result, expected)


-def test_expanding_count_with_min_periods():
+@pytest.mark.parametrize("constructor", [Series, DataFrame])
+def test_expanding_count_with_min_periods(constructor):
     # GH 26996
-    result = Series(range(5)).expanding(min_periods=3).count()
-    expected = Series([np.nan, np.nan, 3.0, 4.0, 5.0])
-    tm.assert_series_equal(result, expected)
+    result = constructor(range(5)).expanding(min_periods=3).count()
+    expected = constructor([np.nan, np.nan, 3.0, 4.0, 5.0])
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("constructor", [Series, DataFrame])
+def test_expanding_count_default_min_periods_with_null_values(constructor):
+    # GH 26996
+    values = [1, 2, 3, np.nan, 4, 5, 6]
+    expected_counts = [1.0, 2.0, 3.0, 3.0, 4.0, 5.0, 6.0]
+
+    result = constructor(values).expanding().count()
+    expected = constructor(expected_counts)
+    tm.assert_equal(result, expected)
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index 1f4289bf6fad9..ab2c7fcb7a0dc 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -459,12 +459,8 @@ def test_rolling_count_with_min_periods(constructor):
 @pytest.mark.parametrize("constructor", [Series, DataFrame])
 def test_rolling_count_default_min_periods_with_null_values(constructor):
     # GH 26996
-    # We need rolling count to have default min_periods=0,
-    # as the method is meant to count how many non-null values,
-    # we want to by default produce a valid count even if
-    # there are very few valid entries in the window
     values = [1, 2, 3, np.nan, 4, 5, 6]
-    expected_counts = [np.nan, np.nan, 3.0, 2.0, 2.0, 2.0, 3.0]
+    expected_counts = [1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0]

     result = constructor(values).rolling(3).count()
     expected = constructor(expected_counts)

From 9fa453174bcd03f7a57d39b406cdef674a8269bb Mon Sep 17 00:00:00 2001
From: fujiaxiang
Date: Fri, 24 Jan 2020 15:30:38 +0800
Subject: [PATCH 156/158] BUG: reverted unrelated changes accidentally added (GH26996)

---
 pandas/_libs/tslibs/timestamps.pyx | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 82ce99bbae285..36566b55e74ad 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -375,13 +375,11 @@ class Timestamp(_Timestamp):
         # Mixing pydatetime positional and keyword arguments is forbidden!

         cdef _TSObject ts
-        print('haha')

         _date_attributes = [year, month, day, hour, minute, second,
                             microsecond, nanosecond]

         if tzinfo is not None:
-            print('tzinfo not None')
             if not PyTZInfo_Check(tzinfo):
                 # tzinfo must be a datetime.tzinfo object, GH#17690
                 raise TypeError(
@@ -394,7 +392,6 @@ class Timestamp(_Timestamp):
             tz, tzinfo = tzinfo, None

         if isinstance(ts_input, str):
-            print('ts_input is str')
             # User passed a date string to parse.
             # Check that the user didn't also pass a date attribute kwarg.
             if any(arg is not None for arg in _date_attributes):

From 987c033eb45889ccf83eb1590afb31d44012a14d Mon Sep 17 00:00:00 2001
From: fujiaxiang
Date: Sat, 25 Jan 2020 09:40:38 +0800
Subject: [PATCH 157/158] BUG: small change in whatsnew (GH26996)

---
 doc/source/whatsnew/v1.0.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 55c975a23f7ba..b06ed684cd525 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -1132,7 +1132,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`)
 - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`)
 - Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`)
-- Bug in :meth:`Rolling.count` and :meth:`Expanding.count` argument ``min_periods`` ignored (:issue:`26996`)
+- Bug in :meth:`Rolling.count` and :meth:`Expanding.count` where the ``min_periods`` argument was ignored (:issue:`26996`)

 Reshaping
 ^^^^^^^^^

From a00532a8cd1b73c854e40819a7ca5dd2fa48fd25 Mon Sep 17 00:00:00 2001
From: fujiaxiang
Date: Sun, 26 Jan 2020 00:50:10 +0800
Subject: [PATCH 158/158] CLN: slight cleanup (GH26996)

---
 pandas/core/window/rolling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index fc0431d01cba7..580c7cc0554d0 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -1188,7 +1188,7 @@ def count(self):
         result = self._constructor(
             result,
             window=self._get_window(),
-            min_periods=self.min_periods if self.min_periods else 0,
+            min_periods=self.min_periods or 0,
             center=self.center,
             axis=self.axis,
             closed=self.closed,
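
Taken together, the GH26996 patches above (149 and 154-158) make
``Rolling.count`` and ``Expanding.count`` honor an explicit ``min_periods``
and fall back to ``min_periods=0`` when none is given; as the comment removed
in patch 155 put it, ``count`` exists to report how many non-null values a
window holds, so a short or sparse window should still produce a number.
A minimal sketch of the resulting behavior, reconstructed from the tests
added in these patches (illustrative only, not itself part of the series):

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, 3, np.nan, 4, 5, 6])

    # Default: min_periods falls back to 0, so even the short leading
    # windows produce a count instead of NaN.
    s.rolling(3).count()
    # -> [1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0]

    # An explicit min_periods is honored again: windows spanning fewer
    # than 3 rows now yield NaN.
    s.rolling(3, min_periods=3).count()
    # -> [NaN, NaN, 3.0, 2.0, 2.0, 2.0, 3.0]

    # Expanding counts pick up the same min_periods=0 default.
    s.expanding().count()
    # -> [1.0, 2.0, 3.0, 3.0, 4.0, 5.0, 6.0]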